@huggingface/transformers 3.2.3 → 3.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/transformers.cjs +203 -92
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +203 -92
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +203 -92
- package/dist/transformers.mjs.map +1 -1
- package/package.json +2 -2
- package/src/base/feature_extraction_utils.js +9 -9
- package/src/base/image_processors_utils.js +11 -0
- package/src/base/processing_utils.js +13 -3
- package/src/configs.js +5 -0
- package/src/env.js +1 -1
- package/src/models/auto/feature_extraction_auto.js +0 -16
- package/src/models/auto/processing_auto.js +0 -16
- package/src/models/convnext/image_processing_convnext.js +1 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
- package/src/models/florence2/processing_florence2.js +3 -0
- package/src/models/idefics3/image_processing_idefics3.js +2 -0
- package/src/models/janus/image_processing_janus.js +1 -0
- package/src/models/mgp_str/processing_mgp_str.js +2 -0
- package/src/models/paligemma/processing_paligemma.js +1 -0
- package/src/models/phi3_v/processing_phi3_v.js +1 -1
- package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
- package/src/models/whisper/feature_extraction_whisper.js +1 -1
- package/src/models.js +50 -15
- package/src/ops/registry.js +10 -0
- package/src/pipelines.js +34 -7
- package/src/tokenizers.js +4 -7
- package/src/utils/dtypes.js +2 -0
- package/src/utils/hub.js +1 -1
- package/src/utils/maths.js +8 -6
- package/src/utils/tensor.js +42 -10
- package/types/base/feature_extraction_utils.d.ts +7 -7
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +17 -19
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
- package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
- package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models.d.ts +32 -17
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -2
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/tsconfig.tsbuildinfo +1 -0
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +3 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +8 -6
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +8 -4
- package/types/utils/tensor.d.ts.map +1 -1
package/dist/transformers.mjs
CHANGED
|
@@ -4132,23 +4132,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
|
|
|
4132
4132
|
}
|
|
4133
4133
|
|
|
4134
4134
|
/**
|
|
4135
|
-
* Instantiate one of the
|
|
4135
|
+
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
4136
4136
|
*
|
|
4137
|
-
* The
|
|
4138
|
-
*
|
|
4137
|
+
* The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
4138
|
+
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
4139
4139
|
*
|
|
4140
4140
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
4141
|
-
* - A string, the *model id* of a pretrained
|
|
4141
|
+
* - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
|
|
4142
4142
|
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
4143
4143
|
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
4144
|
-
* - A path to a *directory* containing
|
|
4145
|
-
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the
|
|
4144
|
+
* - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
|
|
4145
|
+
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
|
|
4146
4146
|
*
|
|
4147
|
-
* @returns {Promise<FeatureExtractor>} A new instance of the
|
|
4147
|
+
* @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
|
|
4148
4148
|
*/
|
|
4149
4149
|
static async from_pretrained(pretrained_model_name_or_path, options) {
|
|
4150
|
-
const
|
|
4151
|
-
return new this(
|
|
4150
|
+
const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
|
|
4151
|
+
return new this(config);
|
|
4152
4152
|
}
|
|
4153
4153
|
}
|
|
4154
4154
|
|
|
@@ -4798,14 +4798,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
4798
4798
|
this.do_thumbnail = config.do_thumbnail;
|
|
4799
4799
|
this.size = config.size ?? config.image_size;
|
|
4800
4800
|
this.do_resize = config.do_resize ?? (this.size !== undefined);
|
|
4801
|
+
// @ts-expect-error TS2339
|
|
4801
4802
|
this.size_divisibility = config.size_divisibility ?? config.size_divisor;
|
|
4802
4803
|
|
|
4803
4804
|
this.do_center_crop = config.do_center_crop;
|
|
4805
|
+
// @ts-expect-error TS2339
|
|
4804
4806
|
this.crop_size = config.crop_size;
|
|
4807
|
+
// @ts-expect-error TS2339
|
|
4805
4808
|
this.do_convert_rgb = config.do_convert_rgb ?? true;
|
|
4809
|
+
// @ts-expect-error TS2339
|
|
4806
4810
|
this.do_crop_margin = config.do_crop_margin;
|
|
4807
4811
|
|
|
4812
|
+
// @ts-expect-error TS2339
|
|
4808
4813
|
this.pad_size = config.pad_size;
|
|
4814
|
+
// @ts-expect-error TS2339
|
|
4809
4815
|
this.do_pad = config.do_pad;
|
|
4810
4816
|
|
|
4811
4817
|
if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
|
|
@@ -5014,6 +5020,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5014
5020
|
// Support both formats for backwards compatibility
|
|
5015
5021
|
else if (Number.isInteger(size)) {
|
|
5016
5022
|
shortest_edge = size;
|
|
5023
|
+
// @ts-expect-error TS2339
|
|
5017
5024
|
longest_edge = this.config.max_size ?? shortest_edge;
|
|
5018
5025
|
|
|
5019
5026
|
} else if (size !== undefined) {
|
|
@@ -5082,6 +5089,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5082
5089
|
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
|
|
5083
5090
|
// Custom resize logic for Qwen2-VL models
|
|
5084
5091
|
const { min_pixels, max_pixels } = size;
|
|
5092
|
+
// @ts-expect-error TS2339
|
|
5085
5093
|
const factor = this.config.patch_size * this.config.merge_size;
|
|
5086
5094
|
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
|
|
5087
5095
|
} else {
|
|
@@ -5097,6 +5105,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5097
5105
|
async resize(image) {
|
|
5098
5106
|
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
|
|
5099
5107
|
return await image.resize(newWidth, newHeight, {
|
|
5108
|
+
// @ts-expect-error TS2322
|
|
5100
5109
|
resample: this.resample,
|
|
5101
5110
|
});
|
|
5102
5111
|
}
|
|
@@ -5147,6 +5156,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5147
5156
|
|
|
5148
5157
|
// Resize the image using thumbnail method.
|
|
5149
5158
|
if (this.do_thumbnail) {
|
|
5159
|
+
// @ts-expect-error TS2345
|
|
5150
5160
|
image = await this.thumbnail(image, this.size, this.resample);
|
|
5151
5161
|
}
|
|
5152
5162
|
|
|
@@ -5171,6 +5181,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5171
5181
|
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
|
|
5172
5182
|
// occurs with data in the hwc format (height, width, channels),
|
|
5173
5183
|
// to emulate the behavior of the original Python code (w/ numpy).
|
|
5184
|
+
/** @type {Float32Array} */
|
|
5174
5185
|
let pixelData = Float32Array.from(image.data);
|
|
5175
5186
|
let imgDims = [image.height, image.width, image.channels];
|
|
5176
5187
|
|
|
@@ -5328,6 +5339,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
5328
5339
|
/**
|
|
5329
5340
|
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
|
|
5330
5341
|
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
|
|
5342
|
+
* @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
|
|
5331
5343
|
*/
|
|
5332
5344
|
|
|
5333
5345
|
|
|
@@ -5361,7 +5373,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5361
5373
|
}
|
|
5362
5374
|
|
|
5363
5375
|
/**
|
|
5364
|
-
* @returns {
|
|
5376
|
+
* @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
|
|
5365
5377
|
*/
|
|
5366
5378
|
get tokenizer() {
|
|
5367
5379
|
return this.components.tokenizer;
|
|
@@ -5374,6 +5386,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5374
5386
|
return this.components.feature_extractor;
|
|
5375
5387
|
}
|
|
5376
5388
|
|
|
5389
|
+
/**
|
|
5390
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
|
|
5391
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
|
|
5392
|
+
* @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
|
|
5393
|
+
*/
|
|
5377
5394
|
apply_chat_template(messages, options = {}) {
|
|
5378
5395
|
if (!this.tokenizer) {
|
|
5379
5396
|
throw new Error('Unable to apply chat template without a tokenizer.');
|
|
@@ -5384,6 +5401,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5384
5401
|
});
|
|
5385
5402
|
}
|
|
5386
5403
|
|
|
5404
|
+
/**
|
|
5405
|
+
* @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
|
|
5406
|
+
* @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
|
|
5407
|
+
*/
|
|
5387
5408
|
batch_decode(...args) {
|
|
5388
5409
|
if (!this.tokenizer) {
|
|
5389
5410
|
throw new Error('Unable to decode without a tokenizer.');
|
|
@@ -5411,8 +5432,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5411
5432
|
/**
|
|
5412
5433
|
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
5413
5434
|
*
|
|
5414
|
-
* The processor class to instantiate is selected based on the `
|
|
5415
|
-
* (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
5435
|
+
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
5436
|
+
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
5416
5437
|
*
|
|
5417
5438
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
5418
5439
|
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
@@ -5531,15 +5552,19 @@ function getNormalizedConfig(config) {
|
|
|
5531
5552
|
case 'florence2':
|
|
5532
5553
|
case 'llava_onevision':
|
|
5533
5554
|
case 'idefics3':
|
|
5555
|
+
// @ts-expect-error TS2339
|
|
5534
5556
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
5535
5557
|
break;
|
|
5536
5558
|
case 'moondream1':
|
|
5559
|
+
// @ts-expect-error TS2339
|
|
5537
5560
|
init_normalized_config = getNormalizedConfig(config.phi_config);
|
|
5538
5561
|
break;
|
|
5539
5562
|
case 'musicgen':
|
|
5563
|
+
// @ts-expect-error TS2339
|
|
5540
5564
|
init_normalized_config = getNormalizedConfig(config.decoder);
|
|
5541
5565
|
break;
|
|
5542
5566
|
case 'multi_modality':
|
|
5567
|
+
// @ts-expect-error TS2339
|
|
5543
5568
|
init_normalized_config = getNormalizedConfig(config.language_config);
|
|
5544
5569
|
break;
|
|
5545
5570
|
|
|
@@ -5660,6 +5685,7 @@ function getNormalizedConfig(config) {
|
|
|
5660
5685
|
break;
|
|
5661
5686
|
|
|
5662
5687
|
case 'vision-encoder-decoder':
|
|
5688
|
+
// @ts-expect-error TS2339
|
|
5663
5689
|
const decoderConfig = getNormalizedConfig(config.decoder);
|
|
5664
5690
|
|
|
5665
5691
|
const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
|
|
@@ -5902,7 +5928,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
5902
5928
|
|
|
5903
5929
|
|
|
5904
5930
|
|
|
5905
|
-
const VERSION = '3.2.3';
|
|
5931
|
+
const VERSION = '3.2.4';
|
|
5906
5932
|
|
|
5907
5933
|
// Check if various APIs are available (depends on environment)
|
|
5908
5934
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -8558,8 +8584,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
8558
8584
|
} else if (session_options.externalData !== undefined) {
|
|
8559
8585
|
externalDataPromises = session_options.externalData.map(async (ext) => {
|
|
8560
8586
|
// if the external data is a string, fetch the file and replace the string with its content
|
|
8587
|
+
// @ts-expect-error TS2339
|
|
8561
8588
|
if (typeof ext.data === "string") {
|
|
8589
|
+
// @ts-expect-error TS2339
|
|
8562
8590
|
const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
|
|
8591
|
+
// @ts-expect-error TS2698
|
|
8563
8592
|
return { ...ext, data: ext_buffer };
|
|
8564
8593
|
}
|
|
8565
8594
|
return ext;
|
|
@@ -9807,6 +9836,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
9807
9836
|
if (this.config.model_type === 'musicgen') {
|
|
9808
9837
|
// Custom logic (TODO: move to Musicgen class)
|
|
9809
9838
|
decoder_input_ids = Array.from({
|
|
9839
|
+
// @ts-expect-error TS2339
|
|
9810
9840
|
length: batch_size * this.config.decoder.num_codebooks
|
|
9811
9841
|
}, () => [decoder_start_token_id]);
|
|
9812
9842
|
|
|
@@ -10136,11 +10166,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10136
10166
|
async encode_image({ pixel_values }) {
|
|
10137
10167
|
// image_inputs === { pixel_values }
|
|
10138
10168
|
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
|
|
10169
|
+
// @ts-expect-error TS2339
|
|
10139
10170
|
if (!this.config.num_image_tokens) {
|
|
10140
10171
|
console.warn(
|
|
10141
10172
|
'The number of image tokens was not set in the model configuration. ' +
|
|
10142
10173
|
`Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
|
|
10143
10174
|
)
|
|
10175
|
+
// @ts-expect-error TS2339
|
|
10144
10176
|
this.config.num_image_tokens = features.dims[1];
|
|
10145
10177
|
}
|
|
10146
10178
|
return features;
|
|
@@ -11568,6 +11600,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11568
11600
|
|
|
11569
11601
|
if (generation_config.return_token_timestamps) {
|
|
11570
11602
|
outputs["token_timestamps"] = this._extract_token_timestamps(
|
|
11603
|
+
// @ts-expect-error TS2345
|
|
11571
11604
|
outputs,
|
|
11572
11605
|
generation_config.alignment_heads,
|
|
11573
11606
|
generation_config.num_frames,
|
|
@@ -11603,6 +11636,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11603
11636
|
);
|
|
11604
11637
|
}
|
|
11605
11638
|
|
|
11639
|
+
// @ts-expect-error TS2339
|
|
11606
11640
|
let median_filter_width = this.config.median_filter_width;
|
|
11607
11641
|
if (median_filter_width === undefined) {
|
|
11608
11642
|
console.warn("Model config has no `median_filter_width`, using default value of 7.")
|
|
@@ -11613,6 +11647,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11613
11647
|
const batch = generate_outputs.cross_attentions;
|
|
11614
11648
|
// Create a list with `decoder_layers` elements, each a tensor of shape
|
|
11615
11649
|
// (batch size, attention_heads, output length, input length).
|
|
11650
|
+
// @ts-expect-error TS2339
|
|
11616
11651
|
const cross_attentions = Array.from({ length: this.config.decoder_layers },
|
|
11617
11652
|
// Concatenate the cross attentions for each layer across sequence length dimension.
|
|
11618
11653
|
(_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
|
|
@@ -11756,6 +11791,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
|
|
|
11756
11791
|
attention_mask,
|
|
11757
11792
|
}) {
|
|
11758
11793
|
|
|
11794
|
+
// @ts-expect-error TS2339
|
|
11759
11795
|
const image_token_index = this.config.image_token_index;
|
|
11760
11796
|
|
|
11761
11797
|
const idsList = input_ids.tolist();
|
|
@@ -12741,6 +12777,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
12741
12777
|
const image_nums = vision_tokens.filter(x => x == image_token_id).length;
|
|
12742
12778
|
const video_nums = vision_tokens.filter(x => x == video_token_id).length;
|
|
12743
12779
|
|
|
12780
|
+
/** @type {number[][]} */
|
|
12744
12781
|
let llm_pos_ids_list = [];
|
|
12745
12782
|
let st = 0;
|
|
12746
12783
|
let remain_images = image_nums;
|
|
@@ -12810,6 +12847,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
12810
12847
|
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
|
|
12811
12848
|
// meaning to perform concatenation along dim=1, we can do the following:
|
|
12812
12849
|
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
12850
|
+
/** @type {number[]} */
|
|
12813
12851
|
const llm_positions = new Array(num_items);
|
|
12814
12852
|
let index = 0;
|
|
12815
12853
|
for (let x = 0; x < 3; ++x) {
|
|
@@ -12850,9 +12888,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
12850
12888
|
{ length: 3 * data.length },
|
|
12851
12889
|
(_, i) => data[i % data.length]
|
|
12852
12890
|
);
|
|
12891
|
+
/** @type {bigint[]} */
|
|
12853
12892
|
const mrope_position_deltas = Array.from(
|
|
12854
12893
|
{ length: dims[0] },
|
|
12855
|
-
(_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] +
|
|
12894
|
+
(_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
12856
12895
|
);
|
|
12857
12896
|
|
|
12858
12897
|
return [
|
|
@@ -13423,7 +13462,7 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
13423
13462
|
*
|
|
13424
13463
|
* **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
|
|
13425
13464
|
* ```javascript
|
|
13426
|
-
* import { DPTForDepthEstimation, AutoProcessor, RawImage,
|
|
13465
|
+
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
13427
13466
|
*
|
|
13428
13467
|
* // Load model and processor
|
|
13429
13468
|
* const model_id = 'Xenova/dpt-hybrid-midas';
|
|
@@ -13432,7 +13471,7 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
13432
13471
|
*
|
|
13433
13472
|
* // Load image from URL
|
|
13434
13473
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
13435
|
-
* const image = await RawImage.
|
|
13474
|
+
* const image = await RawImage.read(url);
|
|
13436
13475
|
*
|
|
13437
13476
|
* // Prepare image for the model
|
|
13438
13477
|
* const inputs = await processor(image);
|
|
@@ -13441,10 +13480,15 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
13441
13480
|
* const { predicted_depth } = await model(inputs);
|
|
13442
13481
|
*
|
|
13443
13482
|
* // Interpolate to original size
|
|
13444
|
-
* const prediction =
|
|
13483
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
13484
|
+
* size: image.size.reverse(),
|
|
13485
|
+
* mode: 'bilinear',
|
|
13486
|
+
* })).squeeze(1);
|
|
13445
13487
|
*
|
|
13446
13488
|
* // Visualize the prediction
|
|
13447
|
-
* const
|
|
13489
|
+
* const min = prediction.min().item();
|
|
13490
|
+
* const max = prediction.max().item();
|
|
13491
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
13448
13492
|
* const depth = RawImage.fromTensor(formatted);
|
|
13449
13493
|
* // RawImage {
|
|
13450
13494
|
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
@@ -13494,11 +13538,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
|
|
|
13494
13538
|
class GLPNModel extends GLPNPreTrainedModel { }
|
|
13495
13539
|
|
|
13496
13540
|
/**
|
|
13497
|
-
*
|
|
13498
|
-
*
|
|
13499
|
-
* **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
|
|
13500
|
-
* ```javascript
|
|
13501
|
-
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
|
|
13541
|
+
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
13502
13542
|
*
|
|
13503
13543
|
* // Load model and processor
|
|
13504
13544
|
* const model_id = 'Xenova/glpn-kitti';
|
|
@@ -13507,7 +13547,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
13507
13547
|
*
|
|
13508
13548
|
* // Load image from URL
|
|
13509
13549
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
13510
|
-
* const image = await RawImage.
|
|
13550
|
+
* const image = await RawImage.read(url);
|
|
13511
13551
|
*
|
|
13512
13552
|
* // Prepare image for the model
|
|
13513
13553
|
* const inputs = await processor(image);
|
|
@@ -13516,13 +13556,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
13516
13556
|
* const { predicted_depth } = await model(inputs);
|
|
13517
13557
|
*
|
|
13518
13558
|
* // Interpolate to original size
|
|
13519
|
-
* const prediction =
|
|
13559
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
13560
|
+
* size: image.size.reverse(),
|
|
13561
|
+
* mode: 'bilinear',
|
|
13562
|
+
* })).squeeze(1);
|
|
13520
13563
|
*
|
|
13521
13564
|
* // Visualize the prediction
|
|
13522
|
-
* const
|
|
13565
|
+
* const min = prediction.min().item();
|
|
13566
|
+
* const max = prediction.max().item();
|
|
13567
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
13523
13568
|
* const depth = RawImage.fromTensor(formatted);
|
|
13524
13569
|
* // RawImage {
|
|
13525
|
-
* // data: Uint8Array(307200) [
|
|
13570
|
+
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
13526
13571
|
* // width: 640,
|
|
13527
13572
|
* // height: 480,
|
|
13528
13573
|
* // channels: 1
|
|
@@ -14489,10 +14534,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
|
|
|
14489
14534
|
|
|
14490
14535
|
const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
|
|
14491
14536
|
|
|
14537
|
+
// @ts-expect-error TS2339
|
|
14492
14538
|
const r = encoder_outputs.dims[1] / this.config.reduction_factor;
|
|
14493
14539
|
const maxlen = Math.floor(r * maxlenratio);
|
|
14494
14540
|
const minlen = Math.floor(r * minlenratio);
|
|
14495
14541
|
|
|
14542
|
+
// @ts-expect-error TS2339
|
|
14496
14543
|
const num_mel_bins = this.config.num_mel_bins;
|
|
14497
14544
|
|
|
14498
14545
|
let spectrogramParts = [];
|
|
@@ -14857,11 +14904,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
|
|
|
14857
14904
|
*/
|
|
14858
14905
|
_apply_and_filter_by_delay_pattern_mask(outputs) {
|
|
14859
14906
|
const [bs_x_codebooks, seqLength] = outputs.dims;
|
|
14907
|
+
// @ts-expect-error TS2339
|
|
14860
14908
|
const num_codebooks = this.config.decoder.num_codebooks;
|
|
14861
14909
|
const upperBound = (seqLength - num_codebooks);
|
|
14862
14910
|
|
|
14863
14911
|
let newDataSize = 0;
|
|
14864
14912
|
for (let i = 0; i < outputs.size; ++i) {
|
|
14913
|
+
// @ts-expect-error TS2339
|
|
14865
14914
|
if (outputs.data[i] === this.config.decoder.pad_token_id) {
|
|
14866
14915
|
continue;
|
|
14867
14916
|
}
|
|
@@ -14891,7 +14940,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
|
|
|
14891
14940
|
let clonedInputIds = structuredClone(input_ids);
|
|
14892
14941
|
for (let i = 0; i < clonedInputIds.length; ++i) {
|
|
14893
14942
|
for (let j = 0; j < clonedInputIds[i].length; ++j) {
|
|
14943
|
+
// @ts-expect-error TS2339
|
|
14894
14944
|
if ((i % this.config.decoder.num_codebooks) >= j) {
|
|
14945
|
+
// @ts-expect-error TS2339
|
|
14895
14946
|
clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
|
|
14896
14947
|
}
|
|
14897
14948
|
}
|
|
@@ -15048,6 +15099,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
|
|
|
15048
15099
|
'past_key_values',
|
|
15049
15100
|
];
|
|
15050
15101
|
|
|
15102
|
+
/**
|
|
15103
|
+
* @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
|
|
15104
|
+
*/
|
|
15051
15105
|
constructor(...args) {
|
|
15052
15106
|
super(...args);
|
|
15053
15107
|
|
|
@@ -16016,10 +16070,17 @@ class SequenceClassifierOutput extends ModelOutput {
|
|
|
16016
16070
|
/**
|
|
16017
16071
|
* @param {Object} output The output of the model.
|
|
16018
16072
|
* @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
|
|
16073
|
+
* @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
|
|
16074
|
+
* Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
|
16019
16075
|
*/
|
|
16020
|
-
constructor({ logits }) {
|
|
16076
|
+
constructor({ logits, ...attentions }) {
|
|
16021
16077
|
super();
|
|
16022
16078
|
this.logits = logits;
|
|
16079
|
+
const attentions_list = Object.values(attentions);
|
|
16080
|
+
if (attentions_list.length > 0) {
|
|
16081
|
+
// Only set attentions if they are not empty
|
|
16082
|
+
this.attentions = attentions_list;
|
|
16083
|
+
}
|
|
16023
16084
|
}
|
|
16024
16085
|
}
|
|
16025
16086
|
|
|
@@ -16275,22 +16336,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
16275
16336
|
|
|
16276
16337
|
class AutoFeatureExtractor {
|
|
16277
16338
|
|
|
16278
|
-
/**
|
|
16279
|
-
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
16280
|
-
*
|
|
16281
|
-
* The processor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
16282
|
-
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
16283
|
-
*
|
|
16284
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
16285
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
16286
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
16287
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
16288
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
16289
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
16290
|
-
*
|
|
16291
|
-
* @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
|
|
16292
|
-
*/
|
|
16293
|
-
|
|
16294
16339
|
/** @type {typeof FeatureExtractor.from_pretrained} */
|
|
16295
16340
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
16296
16341
|
|
|
@@ -16417,22 +16462,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
16417
16462
|
*/
|
|
16418
16463
|
class AutoProcessor {
|
|
16419
16464
|
|
|
16420
|
-
/**
|
|
16421
|
-
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
16422
|
-
*
|
|
16423
|
-
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
16424
|
-
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
16425
|
-
*
|
|
16426
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
16427
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
16428
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
16429
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
16430
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
16431
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
16432
|
-
*
|
|
16433
|
-
* @returns {Promise<Processor>} A new instance of the Processor class.
|
|
16434
|
-
*/
|
|
16435
|
-
|
|
16436
16465
|
/** @type {typeof Processor.from_pretrained} */
|
|
16437
16466
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
16438
16467
|
|
|
@@ -16750,6 +16779,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
16750
16779
|
/**
|
|
16751
16780
|
* Percentage of the image to crop. Only has an effect if this.size < 384.
|
|
16752
16781
|
*/
|
|
16782
|
+
// @ts-expect-error TS2339
|
|
16753
16783
|
this.crop_pct = this.config.crop_pct ?? (224 / 256);
|
|
16754
16784
|
}
|
|
16755
16785
|
|
|
@@ -16952,6 +16982,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
16952
16982
|
class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
|
|
16953
16983
|
constructor(config) {
|
|
16954
16984
|
super(config);
|
|
16985
|
+
// @ts-expect-error TS2339
|
|
16955
16986
|
this.include_top = this.config.include_top ?? true;
|
|
16956
16987
|
if (this.include_top) {
|
|
16957
16988
|
this.image_std = this.image_std.map(x => x * x);
|
|
@@ -17033,8 +17064,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
17033
17064
|
super(config, components);
|
|
17034
17065
|
|
|
17035
17066
|
const {
|
|
17067
|
+
// @ts-expect-error TS2339
|
|
17036
17068
|
tasks_answer_post_processing_type,
|
|
17069
|
+
// @ts-expect-error TS2339
|
|
17037
17070
|
task_prompts_without_inputs,
|
|
17071
|
+
// @ts-expect-error TS2339
|
|
17038
17072
|
task_prompts_with_input,
|
|
17039
17073
|
} = this.image_processor.config;
|
|
17040
17074
|
|
|
@@ -17329,6 +17363,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
17329
17363
|
|
|
17330
17364
|
const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
|
|
17331
17365
|
const end_offset = (i + 1) * pixel_attention_mask_stride;
|
|
17366
|
+
|
|
17367
|
+
// @ts-expect-error
|
|
17332
17368
|
pixel_attention_mask_data.fill(false, start_offset, end_offset);
|
|
17333
17369
|
}
|
|
17334
17370
|
}
|
|
@@ -17735,6 +17771,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
|
|
|
17735
17771
|
},
|
|
17736
17772
|
...config,
|
|
17737
17773
|
});
|
|
17774
|
+
// @ts-expect-error TS2339
|
|
17738
17775
|
this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
|
|
17739
17776
|
}
|
|
17740
17777
|
|
|
@@ -18176,6 +18213,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
|
|
|
18176
18213
|
* - bpe_preds: The list of BPE decoded sentences.
|
|
18177
18214
|
* - wp_preds: The list of wp decoded sentences.
|
|
18178
18215
|
*/
|
|
18216
|
+
// @ts-expect-error The type of this method is not compatible with the one
|
|
18217
|
+
// in the base class. It might be a good idea to fix this.
|
|
18179
18218
|
batch_decode([char_logits, bpe_logits, wp_logits]) {
|
|
18180
18219
|
const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
|
|
18181
18220
|
const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
|
|
@@ -18557,6 +18596,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
18557
18596
|
}
|
|
18558
18597
|
|
|
18559
18598
|
const bos_token = this.tokenizer.bos_token;
|
|
18599
|
+
// @ts-expect-error TS2339
|
|
18560
18600
|
const image_seq_length = this.image_processor.config.image_seq_length;
|
|
18561
18601
|
let input_strings;
|
|
18562
18602
|
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
|
|
@@ -18807,7 +18847,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
|
|
|
18807
18847
|
*
|
|
18808
18848
|
* @param {string|string[]} text
|
|
18809
18849
|
* @param {RawImage|RawImage[]} images
|
|
18810
|
-
* @param {
|
|
18850
|
+
* @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
|
|
18811
18851
|
* @returns {Promise<any>}
|
|
18812
18852
|
*/
|
|
18813
18853
|
async _call(text, images = null, {
|
|
@@ -18991,6 +19031,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
|
|
|
18991
19031
|
|
|
18992
19032
|
let current_speaker = -1;
|
|
18993
19033
|
for (let i = 0; i < scores.length; ++i) {
|
|
19034
|
+
/** @type {number[]} */
|
|
18994
19035
|
const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
|
|
18995
19036
|
const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
|
|
18996
19037
|
const [start, end] = [i, i + 1];
|
|
@@ -19175,6 +19216,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
|
|
|
19175
19216
|
}
|
|
19176
19217
|
|
|
19177
19218
|
if (image_grid_thw) {
|
|
19219
|
+
// @ts-expect-error TS2551
|
|
19178
19220
|
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
19179
19221
|
let index = 0;
|
|
19180
19222
|
|
|
@@ -19662,8 +19704,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
|
|
|
19662
19704
|
'int64',
|
|
19663
19705
|
new BigInt64Array(numPaddedFrames),
|
|
19664
19706
|
[1, numPaddedFrames],
|
|
19665
|
-
)
|
|
19666
|
-
padded_attention_mask.data.fill(1n, 0, num_frames);
|
|
19707
|
+
);
|
|
19708
|
+
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
|
|
19667
19709
|
}
|
|
19668
19710
|
}
|
|
19669
19711
|
}
|
|
@@ -20463,7 +20505,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
|
|
|
20463
20505
|
)
|
|
20464
20506
|
|
|
20465
20507
|
const data = features.data;
|
|
20466
|
-
const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
|
|
20508
|
+
const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
|
|
20467
20509
|
|
|
20468
20510
|
for (let i = 0; i < data.length; ++i) {
|
|
20469
20511
|
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|
|
@@ -20722,6 +20764,16 @@ class TensorOpRegistry {
|
|
|
20722
20764
|
// executionProviders: ['webgpu'],
|
|
20723
20765
|
};
|
|
20724
20766
|
|
|
20767
|
+
static get nearest_interpolate_4d() {
|
|
20768
|
+
if (!this._nearest_interpolate_4d) {
|
|
20769
|
+
this._nearest_interpolate_4d = wrap(
|
|
20770
|
+
[8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
|
|
20771
|
+
this.session_options,
|
|
20772
|
+
'y',
|
|
20773
|
+
);
|
|
20774
|
+
}
|
|
20775
|
+
return this._nearest_interpolate_4d;
|
|
20776
|
+
}
|
|
20725
20777
|
static get bilinear_interpolate_4d() {
|
|
20726
20778
|
if (!this._bilinear_interpolate_4d) {
|
|
20727
20779
|
this._bilinear_interpolate_4d = wrap(
|
|
@@ -21095,6 +21147,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
|
|
|
21095
21147
|
|
|
21096
21148
|
// TODO: Use softmax tensor function
|
|
21097
21149
|
const function_to_apply =
|
|
21150
|
+
// @ts-expect-error TS2339
|
|
21098
21151
|
this.model.config.problem_type === 'multi_label_classification'
|
|
21099
21152
|
? batch => batch.sigmoid()
|
|
21100
21153
|
: batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
|
|
@@ -21103,6 +21156,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
|
|
|
21103
21156
|
batch.dims,
|
|
21104
21157
|
); // single_label_classification (default)
|
|
21105
21158
|
|
|
21159
|
+
// @ts-expect-error TS2339
|
|
21106
21160
|
const id2label = this.model.config.id2label;
|
|
21107
21161
|
|
|
21108
21162
|
const toReturn = [];
|
|
@@ -21205,6 +21259,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
|
|
|
21205
21259
|
const outputs = await this.model(model_inputs)
|
|
21206
21260
|
|
|
21207
21261
|
const logits = outputs.logits;
|
|
21262
|
+
// @ts-expect-error TS2339
|
|
21208
21263
|
const id2label = this.model.config.id2label;
|
|
21209
21264
|
|
|
21210
21265
|
const toReturn = [];
|
|
@@ -21544,11 +21599,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
|
|
|
21544
21599
|
|
|
21545
21600
|
|
|
21546
21601
|
// Add global prefix, if present
|
|
21602
|
+
// @ts-expect-error TS2339
|
|
21547
21603
|
if (this.model.config.prefix) {
|
|
21604
|
+
// @ts-expect-error TS2339
|
|
21548
21605
|
texts = texts.map(x => this.model.config.prefix + x)
|
|
21549
21606
|
}
|
|
21550
21607
|
|
|
21551
21608
|
// Handle task specific params:
|
|
21609
|
+
// @ts-expect-error TS2339
|
|
21552
21610
|
const task_specific_params = this.model.config.task_specific_params
|
|
21553
21611
|
if (task_specific_params && task_specific_params[this.task]) {
|
|
21554
21612
|
// Add prefixes, if present
|
|
@@ -22287,6 +22345,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
|
|
|
22287
22345
|
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
22288
22346
|
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
22289
22347
|
|
|
22348
|
+
// @ts-expect-error TS2339
|
|
22290
22349
|
const id2label = this.model.config.id2label;
|
|
22291
22350
|
|
|
22292
22351
|
const toReturn = [];
|
|
@@ -22597,6 +22656,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22597
22656
|
audio = [/** @type {AudioInput} */ (audio)];
|
|
22598
22657
|
}
|
|
22599
22658
|
|
|
22659
|
+
// @ts-expect-error TS2339
|
|
22600
22660
|
const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
|
|
22601
22661
|
const hop_length = this.processor.feature_extractor.config.hop_length;
|
|
22602
22662
|
|
|
@@ -22662,7 +22722,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22662
22722
|
|
|
22663
22723
|
// TODO: Right now we only get top beam
|
|
22664
22724
|
if (return_timestamps === 'word') {
|
|
22725
|
+
// @ts-expect-error TS2339
|
|
22665
22726
|
chunk.tokens = data.sequences.tolist()[0];
|
|
22727
|
+
// @ts-expect-error TS2339
|
|
22666
22728
|
chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
|
|
22667
22729
|
(/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
|
|
22668
22730
|
);
|
|
@@ -22707,7 +22769,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22707
22769
|
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
22708
22770
|
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
22709
22771
|
|
|
22710
|
-
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
22772
|
+
const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
|
|
22711
22773
|
toReturn.push({ text });
|
|
22712
22774
|
}
|
|
22713
22775
|
return single ? toReturn[0] : toReturn;
|
|
@@ -22856,6 +22918,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
|
|
|
22856
22918
|
const { pixel_values } = await this.processor(preparedImages);
|
|
22857
22919
|
const output = await this.model({ pixel_values });
|
|
22858
22920
|
|
|
22921
|
+
// @ts-expect-error TS2339
|
|
22859
22922
|
const id2label = this.model.config.id2label;
|
|
22860
22923
|
|
|
22861
22924
|
/** @type {ImageClassificationOutput[]} */
|
|
@@ -22970,6 +23033,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
|
|
|
22970
23033
|
}
|
|
22971
23034
|
}
|
|
22972
23035
|
|
|
23036
|
+
// @ts-expect-error TS2339
|
|
22973
23037
|
const id2label = this.model.config.id2label;
|
|
22974
23038
|
|
|
22975
23039
|
/** @type {ImageSegmentationPipelineOutput[]} */
|
|
@@ -23196,6 +23260,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
|
|
|
23196
23260
|
const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
|
|
23197
23261
|
|
|
23198
23262
|
// Add labels
|
|
23263
|
+
// @ts-expect-error TS2339
|
|
23199
23264
|
const id2label = this.model.config.id2label;
|
|
23200
23265
|
|
|
23201
23266
|
// Format output
|
|
@@ -23415,6 +23480,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
|
|
|
23415
23480
|
// Run model
|
|
23416
23481
|
const output = await this.model.generate({
|
|
23417
23482
|
inputs: pixel_values,
|
|
23483
|
+
// @ts-expect-error TS2339
|
|
23418
23484
|
max_length: this.model.config.decoder.max_position_embeddings,
|
|
23419
23485
|
decoder_input_ids,
|
|
23420
23486
|
...generate_kwargs,
|
|
@@ -23530,6 +23596,7 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
|
|
|
23530
23596
|
// Generate waveform
|
|
23531
23597
|
const { waveform } = await this.model(inputs);
|
|
23532
23598
|
|
|
23599
|
+
// @ts-expect-error TS2339
|
|
23533
23600
|
const sampling_rate = this.model.config.sampling_rate;
|
|
23534
23601
|
return {
|
|
23535
23602
|
audio: waveform.data,
|
|
@@ -23687,11 +23754,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
|
|
|
23687
23754
|
|
|
23688
23755
|
const toReturn = [];
|
|
23689
23756
|
for (let i = 0; i < preparedImages.length; ++i) {
|
|
23690
|
-
const
|
|
23691
|
-
const
|
|
23757
|
+
const batch = predicted_depth[i];
|
|
23758
|
+
const [height, width] = batch.dims.slice(-2);
|
|
23759
|
+
const [new_width, new_height] = preparedImages[i].size;
|
|
23760
|
+
|
|
23761
|
+
// Interpolate to original size
|
|
23762
|
+
const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
|
|
23763
|
+
size: [new_height, new_width],
|
|
23764
|
+
mode: 'bilinear',
|
|
23765
|
+
})).view(new_height, new_width);
|
|
23766
|
+
|
|
23767
|
+
const minval = /** @type {number} */(prediction.min().item());
|
|
23768
|
+
const maxval = /** @type {number} */(prediction.max().item());
|
|
23769
|
+
const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
|
|
23770
|
+
const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
|
|
23692
23771
|
toReturn.push({
|
|
23693
|
-
predicted_depth:
|
|
23694
|
-
depth
|
|
23772
|
+
predicted_depth: prediction,
|
|
23773
|
+
depth,
|
|
23695
23774
|
});
|
|
23696
23775
|
}
|
|
23697
23776
|
|
|
@@ -24171,6 +24250,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
24171
24250
|
return result;
|
|
24172
24251
|
}
|
|
24173
24252
|
|
|
24253
|
+
|
|
24174
24254
|
/***/ }),
|
|
24175
24255
|
|
|
24176
24256
|
/***/ "./src/tokenizers.js":
|
|
@@ -24239,7 +24319,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
24239
24319
|
/* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
|
|
24240
24320
|
/* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
|
|
24241
24321
|
/* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
|
|
24242
|
-
/* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
|
|
24243
24322
|
|
|
24244
24323
|
/**
|
|
24245
24324
|
* @file Tokenizers are used to prepare textual inputs for a model.
|
|
@@ -24276,7 +24355,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
24276
24355
|
|
|
24277
24356
|
|
|
24278
24357
|
|
|
24279
|
-
|
|
24280
24358
|
/**
|
|
24281
24359
|
* @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
|
|
24282
24360
|
* @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
|
|
@@ -24760,7 +24838,7 @@ class Unigram extends TokenizerModel {
|
|
|
24760
24838
|
* Create a new Unigram tokenizer model.
|
|
24761
24839
|
* @param {Object} config The configuration object for the Unigram model.
|
|
24762
24840
|
* @param {number} config.unk_id The ID of the unknown token
|
|
24763
|
-
* @param {
|
|
24841
|
+
* @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
|
|
24764
24842
|
* @param {Object} moreConfig Additional configuration object for the Unigram model.
|
|
24765
24843
|
*/
|
|
24766
24844
|
constructor(config, moreConfig) {
|
|
@@ -24768,11 +24846,10 @@ class Unigram extends TokenizerModel {
|
|
|
24768
24846
|
|
|
24769
24847
|
const vocabSize = config.vocab.length;
|
|
24770
24848
|
this.vocab = new Array(vocabSize);
|
|
24849
|
+
/** @type {number[]} */
|
|
24771
24850
|
this.scores = new Array(vocabSize);
|
|
24772
24851
|
for (let i = 0; i < vocabSize; ++i) {
|
|
24773
|
-
|
|
24774
|
-
this.vocab[i] = piece[0];
|
|
24775
|
-
this.scores[i] = piece[1];
|
|
24852
|
+
[this.vocab[i], this.scores[i]] = config.vocab[i];
|
|
24776
24853
|
}
|
|
24777
24854
|
|
|
24778
24855
|
this.unk_token_id = config.unk_id;
|
|
@@ -30129,6 +30206,8 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
30129
30206
|
/* harmony export */ });
|
|
30130
30207
|
/* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
|
|
30131
30208
|
/* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
|
|
30209
|
+
/// <reference types="@webgpu/types" />
|
|
30210
|
+
|
|
30132
30211
|
|
|
30133
30212
|
|
|
30134
30213
|
|
|
@@ -30382,7 +30461,7 @@ class FileResponse {
|
|
|
30382
30461
|
*/
|
|
30383
30462
|
async arrayBuffer() {
|
|
30384
30463
|
const data = await fs__WEBPACK_IMPORTED_MODULE_0__["default"].promises.readFile(this.filePath);
|
|
30385
|
-
return data.buffer;
|
|
30464
|
+
return /** @type {ArrayBuffer} */ (data.buffer);
|
|
30386
30465
|
}
|
|
30387
30466
|
|
|
30388
30467
|
/**
|
|
@@ -32041,8 +32120,9 @@ function magnitude(arr) {
|
|
|
32041
32120
|
|
|
32042
32121
|
/**
|
|
32043
32122
|
* Returns the value and index of the minimum element in an array.
|
|
32044
|
-
* @
|
|
32045
|
-
* @
|
|
32123
|
+
* @template {number[]|bigint[]|AnyTypedArray} T
|
|
32124
|
+
* @param {T} arr array of numbers.
|
|
32125
|
+
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
|
|
32046
32126
|
* @throws {Error} If array is empty.
|
|
32047
32127
|
*/
|
|
32048
32128
|
function min(arr) {
|
|
@@ -32055,14 +32135,15 @@ function min(arr) {
|
|
|
32055
32135
|
indexOfMin = i;
|
|
32056
32136
|
}
|
|
32057
32137
|
}
|
|
32058
|
-
return [min, indexOfMin];
|
|
32138
|
+
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
|
|
32059
32139
|
}
|
|
32060
32140
|
|
|
32061
32141
|
|
|
32062
32142
|
/**
|
|
32063
32143
|
* Returns the value and index of the maximum element in an array.
|
|
32064
|
-
* @
|
|
32065
|
-
* @
|
|
32144
|
+
* @template {number[]|bigint[]|AnyTypedArray} T
|
|
32145
|
+
* @param {T} arr array of numbers.
|
|
32146
|
+
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
|
|
32066
32147
|
* @throws {Error} If array is empty.
|
|
32067
32148
|
*/
|
|
32068
32149
|
function max(arr) {
|
|
@@ -32075,7 +32156,7 @@ function max(arr) {
|
|
|
32075
32156
|
indexOfMax = i;
|
|
32076
32157
|
}
|
|
32077
32158
|
}
|
|
32078
|
-
return [
|
|
32159
|
+
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
|
|
32079
32160
|
}
|
|
32080
32161
|
|
|
32081
32162
|
function isPowerOfTwo(number) {
|
|
@@ -33372,8 +33453,6 @@ class Tensor {
|
|
|
33372
33453
|
return this.permute(...dims);
|
|
33373
33454
|
}
|
|
33374
33455
|
|
|
33375
|
-
// TODO add .max() and .min() methods
|
|
33376
|
-
|
|
33377
33456
|
/**
|
|
33378
33457
|
* Returns the sum of each row of the input tensor in the given dimension dim.
|
|
33379
33458
|
*
|
|
@@ -33667,6 +33746,36 @@ class Tensor {
|
|
|
33667
33746
|
return mean(this, dim, keepdim);
|
|
33668
33747
|
}
|
|
33669
33748
|
|
|
33749
|
+
min(dim = null, keepdim = false) {
|
|
33750
|
+
if (dim !== null) {
|
|
33751
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33752
|
+
}
|
|
33753
|
+
const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
|
|
33754
|
+
return new Tensor(this.type, [value], []);
|
|
33755
|
+
}
|
|
33756
|
+
max(dim = null, keepdim = false) {
|
|
33757
|
+
if (dim !== null) {
|
|
33758
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33759
|
+
}
|
|
33760
|
+
const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
|
|
33761
|
+
return new Tensor(this.type, [value], []);
|
|
33762
|
+
}
|
|
33763
|
+
|
|
33764
|
+
argmin(dim = null, keepdim = false) {
|
|
33765
|
+
if (dim !== null) {
|
|
33766
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33767
|
+
}
|
|
33768
|
+
const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
|
|
33769
|
+
return new Tensor('int64', [BigInt(index)], []);
|
|
33770
|
+
}
|
|
33771
|
+
argmax(dim = null, keepdim = false) {
|
|
33772
|
+
if (dim !== null) {
|
|
33773
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33774
|
+
}
|
|
33775
|
+
const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
|
|
33776
|
+
return new Tensor('int64', [BigInt(index)], []);
|
|
33777
|
+
}
|
|
33778
|
+
|
|
33670
33779
|
/**
|
|
33671
33780
|
* Performs Tensor dtype conversion.
|
|
33672
33781
|
* @param {DataType} type The desired data type.
|
|
@@ -33800,7 +33909,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
|
|
|
33800
33909
|
* @param {Tensor} input the input tensor
|
|
33801
33910
|
* @param {Object} options the options for the interpolation
|
|
33802
33911
|
* @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
|
|
33803
|
-
* @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
|
|
33912
|
+
* @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
|
|
33804
33913
|
* @returns {Promise<Tensor>} The interpolated tensor.
|
|
33805
33914
|
*/
|
|
33806
33915
|
async function interpolate_4d(input, {
|
|
@@ -33830,7 +33939,9 @@ async function interpolate_4d(input, {
|
|
|
33830
33939
|
}
|
|
33831
33940
|
|
|
33832
33941
|
let op;
|
|
33833
|
-
if (mode === '
|
|
33942
|
+
if (mode === 'nearest') {
|
|
33943
|
+
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
|
|
33944
|
+
} else if (mode === 'bilinear') {
|
|
33834
33945
|
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
|
|
33835
33946
|
} else if (mode === 'bicubic') {
|
|
33836
33947
|
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
|
|
@@ -33871,13 +33982,13 @@ async function rfft(x, a) {
|
|
|
33871
33982
|
* Returns the k largest elements of the given input tensor.
|
|
33872
33983
|
* Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
|
|
33873
33984
|
* @param {Tensor} x the input tensor
|
|
33874
|
-
* @param {number} k the k in "top-k"
|
|
33985
|
+
* @param {number} [k] the k in "top-k"
|
|
33875
33986
|
* @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
|
|
33876
33987
|
*/
|
|
33877
33988
|
async function topk(x, k) {
|
|
33878
33989
|
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
|
|
33879
33990
|
|
|
33880
|
-
if (k
|
|
33991
|
+
if (k == null) {
|
|
33881
33992
|
k = x.dims.at(-1);
|
|
33882
33993
|
} else {
|
|
33883
33994
|
k = Math.min(k, x.dims.at(-1));
|
|
@@ -33906,10 +34017,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
|
|
|
33906
34017
|
async function slice(data, starts, ends, axes, steps) {
|
|
33907
34018
|
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
|
|
33908
34019
|
return await op({
|
|
33909
|
-
x: data,
|
|
33910
|
-
s: arrayToIndexTensor(starts),
|
|
33911
|
-
e: arrayToIndexTensor(ends),
|
|
33912
|
-
a: arrayToIndexTensor(axes),
|
|
34020
|
+
x: data,
|
|
34021
|
+
s: arrayToIndexTensor(starts),
|
|
34022
|
+
e: arrayToIndexTensor(ends),
|
|
34023
|
+
a: arrayToIndexTensor(axes),
|
|
33913
34024
|
t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
|
|
33914
34025
|
});
|
|
33915
34026
|
}
|