@huggingface/transformers 3.2.3 → 3.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/transformers.cjs +203 -92
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +203 -92
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +203 -92
- package/dist/transformers.mjs.map +1 -1
- package/package.json +2 -2
- package/src/base/feature_extraction_utils.js +9 -9
- package/src/base/image_processors_utils.js +11 -0
- package/src/base/processing_utils.js +13 -3
- package/src/configs.js +5 -0
- package/src/env.js +1 -1
- package/src/models/auto/feature_extraction_auto.js +0 -16
- package/src/models/auto/processing_auto.js +0 -16
- package/src/models/convnext/image_processing_convnext.js +1 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
- package/src/models/florence2/processing_florence2.js +3 -0
- package/src/models/idefics3/image_processing_idefics3.js +2 -0
- package/src/models/janus/image_processing_janus.js +1 -0
- package/src/models/mgp_str/processing_mgp_str.js +2 -0
- package/src/models/paligemma/processing_paligemma.js +1 -0
- package/src/models/phi3_v/processing_phi3_v.js +1 -1
- package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
- package/src/models/whisper/feature_extraction_whisper.js +1 -1
- package/src/models.js +50 -15
- package/src/ops/registry.js +10 -0
- package/src/pipelines.js +34 -7
- package/src/tokenizers.js +4 -7
- package/src/utils/dtypes.js +2 -0
- package/src/utils/hub.js +1 -1
- package/src/utils/maths.js +8 -6
- package/src/utils/tensor.js +42 -10
- package/types/base/feature_extraction_utils.d.ts +7 -7
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +17 -19
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
- package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
- package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models.d.ts +32 -17
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -2
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/tsconfig.tsbuildinfo +1 -0
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +3 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +8 -6
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +8 -4
- package/types/utils/tensor.d.ts.map +1 -1
package/dist/transformers.js
CHANGED
|
@@ -6927,23 +6927,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
|
|
|
6927
6927
|
}
|
|
6928
6928
|
|
|
6929
6929
|
/**
|
|
6930
|
-
* Instantiate one of the
|
|
6930
|
+
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
6931
6931
|
*
|
|
6932
|
-
* The
|
|
6933
|
-
*
|
|
6932
|
+
* The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
6933
|
+
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
6934
6934
|
*
|
|
6935
6935
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
6936
|
-
* - A string, the *model id* of a pretrained
|
|
6936
|
+
* - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
|
|
6937
6937
|
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
6938
6938
|
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
6939
|
-
* - A path to a *directory* containing
|
|
6940
|
-
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the
|
|
6939
|
+
* - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
|
|
6940
|
+
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
|
|
6941
6941
|
*
|
|
6942
|
-
* @returns {Promise<FeatureExtractor>} A new instance of the
|
|
6942
|
+
* @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
|
|
6943
6943
|
*/
|
|
6944
6944
|
static async from_pretrained(pretrained_model_name_or_path, options) {
|
|
6945
|
-
const
|
|
6946
|
-
return new this(
|
|
6945
|
+
const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
|
|
6946
|
+
return new this(config);
|
|
6947
6947
|
}
|
|
6948
6948
|
}
|
|
6949
6949
|
|
|
@@ -7593,14 +7593,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
7593
7593
|
this.do_thumbnail = config.do_thumbnail;
|
|
7594
7594
|
this.size = config.size ?? config.image_size;
|
|
7595
7595
|
this.do_resize = config.do_resize ?? (this.size !== undefined);
|
|
7596
|
+
// @ts-expect-error TS2339
|
|
7596
7597
|
this.size_divisibility = config.size_divisibility ?? config.size_divisor;
|
|
7597
7598
|
|
|
7598
7599
|
this.do_center_crop = config.do_center_crop;
|
|
7600
|
+
// @ts-expect-error TS2339
|
|
7599
7601
|
this.crop_size = config.crop_size;
|
|
7602
|
+
// @ts-expect-error TS2339
|
|
7600
7603
|
this.do_convert_rgb = config.do_convert_rgb ?? true;
|
|
7604
|
+
// @ts-expect-error TS2339
|
|
7601
7605
|
this.do_crop_margin = config.do_crop_margin;
|
|
7602
7606
|
|
|
7607
|
+
// @ts-expect-error TS2339
|
|
7603
7608
|
this.pad_size = config.pad_size;
|
|
7609
|
+
// @ts-expect-error TS2339
|
|
7604
7610
|
this.do_pad = config.do_pad;
|
|
7605
7611
|
|
|
7606
7612
|
if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
|
|
@@ -7809,6 +7815,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
7809
7815
|
// Support both formats for backwards compatibility
|
|
7810
7816
|
else if (Number.isInteger(size)) {
|
|
7811
7817
|
shortest_edge = size;
|
|
7818
|
+
// @ts-expect-error TS2339
|
|
7812
7819
|
longest_edge = this.config.max_size ?? shortest_edge;
|
|
7813
7820
|
|
|
7814
7821
|
} else if (size !== undefined) {
|
|
@@ -7877,6 +7884,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
7877
7884
|
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
|
|
7878
7885
|
// Custom resize logic for Qwen2-VL models
|
|
7879
7886
|
const { min_pixels, max_pixels } = size;
|
|
7887
|
+
// @ts-expect-error TS2339
|
|
7880
7888
|
const factor = this.config.patch_size * this.config.merge_size;
|
|
7881
7889
|
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
|
|
7882
7890
|
} else {
|
|
@@ -7892,6 +7900,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
7892
7900
|
async resize(image) {
|
|
7893
7901
|
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
|
|
7894
7902
|
return await image.resize(newWidth, newHeight, {
|
|
7903
|
+
// @ts-expect-error TS2322
|
|
7895
7904
|
resample: this.resample,
|
|
7896
7905
|
});
|
|
7897
7906
|
}
|
|
@@ -7942,6 +7951,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
7942
7951
|
|
|
7943
7952
|
// Resize the image using thumbnail method.
|
|
7944
7953
|
if (this.do_thumbnail) {
|
|
7954
|
+
// @ts-expect-error TS2345
|
|
7945
7955
|
image = await this.thumbnail(image, this.size, this.resample);
|
|
7946
7956
|
}
|
|
7947
7957
|
|
|
@@ -7966,6 +7976,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
7966
7976
|
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
|
|
7967
7977
|
// occurs with data in the hwc format (height, width, channels),
|
|
7968
7978
|
// to emulate the behavior of the original Python code (w/ numpy).
|
|
7979
|
+
/** @type {Float32Array} */
|
|
7969
7980
|
let pixelData = Float32Array.from(image.data);
|
|
7970
7981
|
let imgDims = [image.height, image.width, image.channels];
|
|
7971
7982
|
|
|
@@ -8123,6 +8134,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
8123
8134
|
/**
|
|
8124
8135
|
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
|
|
8125
8136
|
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
|
|
8137
|
+
* @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
|
|
8126
8138
|
*/
|
|
8127
8139
|
|
|
8128
8140
|
|
|
@@ -8156,7 +8168,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
8156
8168
|
}
|
|
8157
8169
|
|
|
8158
8170
|
/**
|
|
8159
|
-
* @returns {
|
|
8171
|
+
* @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
|
|
8160
8172
|
*/
|
|
8161
8173
|
get tokenizer() {
|
|
8162
8174
|
return this.components.tokenizer;
|
|
@@ -8169,6 +8181,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
8169
8181
|
return this.components.feature_extractor;
|
|
8170
8182
|
}
|
|
8171
8183
|
|
|
8184
|
+
/**
|
|
8185
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
|
|
8186
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
|
|
8187
|
+
* @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
|
|
8188
|
+
*/
|
|
8172
8189
|
apply_chat_template(messages, options = {}) {
|
|
8173
8190
|
if (!this.tokenizer) {
|
|
8174
8191
|
throw new Error('Unable to apply chat template without a tokenizer.');
|
|
@@ -8179,6 +8196,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
8179
8196
|
});
|
|
8180
8197
|
}
|
|
8181
8198
|
|
|
8199
|
+
/**
|
|
8200
|
+
* @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
|
|
8201
|
+
* @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
|
|
8202
|
+
*/
|
|
8182
8203
|
batch_decode(...args) {
|
|
8183
8204
|
if (!this.tokenizer) {
|
|
8184
8205
|
throw new Error('Unable to decode without a tokenizer.');
|
|
@@ -8206,8 +8227,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
8206
8227
|
/**
|
|
8207
8228
|
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
8208
8229
|
*
|
|
8209
|
-
* The processor class to instantiate is selected based on the `
|
|
8210
|
-
* (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
8230
|
+
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
8231
|
+
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
8211
8232
|
*
|
|
8212
8233
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
8213
8234
|
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
@@ -8326,15 +8347,19 @@ function getNormalizedConfig(config) {
|
|
|
8326
8347
|
case 'florence2':
|
|
8327
8348
|
case 'llava_onevision':
|
|
8328
8349
|
case 'idefics3':
|
|
8350
|
+
// @ts-expect-error TS2339
|
|
8329
8351
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
8330
8352
|
break;
|
|
8331
8353
|
case 'moondream1':
|
|
8354
|
+
// @ts-expect-error TS2339
|
|
8332
8355
|
init_normalized_config = getNormalizedConfig(config.phi_config);
|
|
8333
8356
|
break;
|
|
8334
8357
|
case 'musicgen':
|
|
8358
|
+
// @ts-expect-error TS2339
|
|
8335
8359
|
init_normalized_config = getNormalizedConfig(config.decoder);
|
|
8336
8360
|
break;
|
|
8337
8361
|
case 'multi_modality':
|
|
8362
|
+
// @ts-expect-error TS2339
|
|
8338
8363
|
init_normalized_config = getNormalizedConfig(config.language_config);
|
|
8339
8364
|
break;
|
|
8340
8365
|
|
|
@@ -8455,6 +8480,7 @@ function getNormalizedConfig(config) {
|
|
|
8455
8480
|
break;
|
|
8456
8481
|
|
|
8457
8482
|
case 'vision-encoder-decoder':
|
|
8483
|
+
// @ts-expect-error TS2339
|
|
8458
8484
|
const decoderConfig = getNormalizedConfig(config.decoder);
|
|
8459
8485
|
|
|
8460
8486
|
const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
|
|
@@ -8697,7 +8723,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
8697
8723
|
|
|
8698
8724
|
|
|
8699
8725
|
|
|
8700
|
-
const VERSION = '3.2.3';
|
|
8726
|
+
const VERSION = '3.2.4';
|
|
8701
8727
|
|
|
8702
8728
|
// Check if various APIs are available (depends on environment)
|
|
8703
8729
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -11353,8 +11379,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
11353
11379
|
} else if (session_options.externalData !== undefined) {
|
|
11354
11380
|
externalDataPromises = session_options.externalData.map(async (ext) => {
|
|
11355
11381
|
// if the external data is a string, fetch the file and replace the string with its content
|
|
11382
|
+
// @ts-expect-error TS2339
|
|
11356
11383
|
if (typeof ext.data === "string") {
|
|
11384
|
+
// @ts-expect-error TS2339
|
|
11357
11385
|
const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
|
|
11386
|
+
// @ts-expect-error TS2698
|
|
11358
11387
|
return { ...ext, data: ext_buffer };
|
|
11359
11388
|
}
|
|
11360
11389
|
return ext;
|
|
@@ -12602,6 +12631,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
12602
12631
|
if (this.config.model_type === 'musicgen') {
|
|
12603
12632
|
// Custom logic (TODO: move to Musicgen class)
|
|
12604
12633
|
decoder_input_ids = Array.from({
|
|
12634
|
+
// @ts-expect-error TS2339
|
|
12605
12635
|
length: batch_size * this.config.decoder.num_codebooks
|
|
12606
12636
|
}, () => [decoder_start_token_id]);
|
|
12607
12637
|
|
|
@@ -12931,11 +12961,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
12931
12961
|
async encode_image({ pixel_values }) {
|
|
12932
12962
|
// image_inputs === { pixel_values }
|
|
12933
12963
|
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
|
|
12964
|
+
// @ts-expect-error TS2339
|
|
12934
12965
|
if (!this.config.num_image_tokens) {
|
|
12935
12966
|
console.warn(
|
|
12936
12967
|
'The number of image tokens was not set in the model configuration. ' +
|
|
12937
12968
|
`Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
|
|
12938
12969
|
)
|
|
12970
|
+
// @ts-expect-error TS2339
|
|
12939
12971
|
this.config.num_image_tokens = features.dims[1];
|
|
12940
12972
|
}
|
|
12941
12973
|
return features;
|
|
@@ -14363,6 +14395,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
14363
14395
|
|
|
14364
14396
|
if (generation_config.return_token_timestamps) {
|
|
14365
14397
|
outputs["token_timestamps"] = this._extract_token_timestamps(
|
|
14398
|
+
// @ts-expect-error TS2345
|
|
14366
14399
|
outputs,
|
|
14367
14400
|
generation_config.alignment_heads,
|
|
14368
14401
|
generation_config.num_frames,
|
|
@@ -14398,6 +14431,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
14398
14431
|
);
|
|
14399
14432
|
}
|
|
14400
14433
|
|
|
14434
|
+
// @ts-expect-error TS2339
|
|
14401
14435
|
let median_filter_width = this.config.median_filter_width;
|
|
14402
14436
|
if (median_filter_width === undefined) {
|
|
14403
14437
|
console.warn("Model config has no `median_filter_width`, using default value of 7.")
|
|
@@ -14408,6 +14442,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
14408
14442
|
const batch = generate_outputs.cross_attentions;
|
|
14409
14443
|
// Create a list with `decoder_layers` elements, each a tensor of shape
|
|
14410
14444
|
// (batch size, attention_heads, output length, input length).
|
|
14445
|
+
// @ts-expect-error TS2339
|
|
14411
14446
|
const cross_attentions = Array.from({ length: this.config.decoder_layers },
|
|
14412
14447
|
// Concatenate the cross attentions for each layer across sequence length dimension.
|
|
14413
14448
|
(_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
|
|
@@ -14551,6 +14586,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
|
|
|
14551
14586
|
attention_mask,
|
|
14552
14587
|
}) {
|
|
14553
14588
|
|
|
14589
|
+
// @ts-expect-error TS2339
|
|
14554
14590
|
const image_token_index = this.config.image_token_index;
|
|
14555
14591
|
|
|
14556
14592
|
const idsList = input_ids.tolist();
|
|
@@ -15536,6 +15572,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
15536
15572
|
const image_nums = vision_tokens.filter(x => x == image_token_id).length;
|
|
15537
15573
|
const video_nums = vision_tokens.filter(x => x == video_token_id).length;
|
|
15538
15574
|
|
|
15575
|
+
/** @type {number[][]} */
|
|
15539
15576
|
let llm_pos_ids_list = [];
|
|
15540
15577
|
let st = 0;
|
|
15541
15578
|
let remain_images = image_nums;
|
|
@@ -15605,6 +15642,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
15605
15642
|
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
|
|
15606
15643
|
// meaning to perform concatenation along dim=1, we can do the following:
|
|
15607
15644
|
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
15645
|
+
/** @type {number[]} */
|
|
15608
15646
|
const llm_positions = new Array(num_items);
|
|
15609
15647
|
let index = 0;
|
|
15610
15648
|
for (let x = 0; x < 3; ++x) {
|
|
@@ -15645,9 +15683,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
15645
15683
|
{ length: 3 * data.length },
|
|
15646
15684
|
(_, i) => data[i % data.length]
|
|
15647
15685
|
);
|
|
15686
|
+
/** @type {bigint[]} */
|
|
15648
15687
|
const mrope_position_deltas = Array.from(
|
|
15649
15688
|
{ length: dims[0] },
|
|
15650
|
-
(_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] +
|
|
15689
|
+
(_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
15651
15690
|
);
|
|
15652
15691
|
|
|
15653
15692
|
return [
|
|
@@ -16218,7 +16257,7 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
16218
16257
|
*
|
|
16219
16258
|
* **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
|
|
16220
16259
|
* ```javascript
|
|
16221
|
-
* import { DPTForDepthEstimation, AutoProcessor, RawImage,
|
|
16260
|
+
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
16222
16261
|
*
|
|
16223
16262
|
* // Load model and processor
|
|
16224
16263
|
* const model_id = 'Xenova/dpt-hybrid-midas';
|
|
@@ -16227,7 +16266,7 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
16227
16266
|
*
|
|
16228
16267
|
* // Load image from URL
|
|
16229
16268
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
16230
|
-
* const image = await RawImage.
|
|
16269
|
+
* const image = await RawImage.read(url);
|
|
16231
16270
|
*
|
|
16232
16271
|
* // Prepare image for the model
|
|
16233
16272
|
* const inputs = await processor(image);
|
|
@@ -16236,10 +16275,15 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
16236
16275
|
* const { predicted_depth } = await model(inputs);
|
|
16237
16276
|
*
|
|
16238
16277
|
* // Interpolate to original size
|
|
16239
|
-
* const prediction =
|
|
16278
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
16279
|
+
* size: image.size.reverse(),
|
|
16280
|
+
* mode: 'bilinear',
|
|
16281
|
+
* })).squeeze(1);
|
|
16240
16282
|
*
|
|
16241
16283
|
* // Visualize the prediction
|
|
16242
|
-
* const
|
|
16284
|
+
* const min = prediction.min().item();
|
|
16285
|
+
* const max = prediction.max().item();
|
|
16286
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
16243
16287
|
* const depth = RawImage.fromTensor(formatted);
|
|
16244
16288
|
* // RawImage {
|
|
16245
16289
|
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
@@ -16289,11 +16333,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
|
|
|
16289
16333
|
class GLPNModel extends GLPNPreTrainedModel { }
|
|
16290
16334
|
|
|
16291
16335
|
/**
|
|
16292
|
-
*
|
|
16293
|
-
*
|
|
16294
|
-
* **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
|
|
16295
|
-
* ```javascript
|
|
16296
|
-
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
|
|
16336
|
+
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
16297
16337
|
*
|
|
16298
16338
|
* // Load model and processor
|
|
16299
16339
|
* const model_id = 'Xenova/glpn-kitti';
|
|
@@ -16302,7 +16342,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
16302
16342
|
*
|
|
16303
16343
|
* // Load image from URL
|
|
16304
16344
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
16305
|
-
* const image = await RawImage.
|
|
16345
|
+
* const image = await RawImage.read(url);
|
|
16306
16346
|
*
|
|
16307
16347
|
* // Prepare image for the model
|
|
16308
16348
|
* const inputs = await processor(image);
|
|
@@ -16311,13 +16351,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
16311
16351
|
* const { predicted_depth } = await model(inputs);
|
|
16312
16352
|
*
|
|
16313
16353
|
* // Interpolate to original size
|
|
16314
|
-
* const prediction =
|
|
16354
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
16355
|
+
* size: image.size.reverse(),
|
|
16356
|
+
* mode: 'bilinear',
|
|
16357
|
+
* })).squeeze(1);
|
|
16315
16358
|
*
|
|
16316
16359
|
* // Visualize the prediction
|
|
16317
|
-
* const
|
|
16360
|
+
* const min = prediction.min().item();
|
|
16361
|
+
* const max = prediction.max().item();
|
|
16362
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
16318
16363
|
* const depth = RawImage.fromTensor(formatted);
|
|
16319
16364
|
* // RawImage {
|
|
16320
|
-
* // data: Uint8Array(307200) [
|
|
16365
|
+
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
16321
16366
|
* // width: 640,
|
|
16322
16367
|
* // height: 480,
|
|
16323
16368
|
* // channels: 1
|
|
@@ -17284,10 +17329,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
|
|
|
17284
17329
|
|
|
17285
17330
|
const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
|
|
17286
17331
|
|
|
17332
|
+
// @ts-expect-error TS2339
|
|
17287
17333
|
const r = encoder_outputs.dims[1] / this.config.reduction_factor;
|
|
17288
17334
|
const maxlen = Math.floor(r * maxlenratio);
|
|
17289
17335
|
const minlen = Math.floor(r * minlenratio);
|
|
17290
17336
|
|
|
17337
|
+
// @ts-expect-error TS2339
|
|
17291
17338
|
const num_mel_bins = this.config.num_mel_bins;
|
|
17292
17339
|
|
|
17293
17340
|
let spectrogramParts = [];
|
|
@@ -17652,11 +17699,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
|
|
|
17652
17699
|
*/
|
|
17653
17700
|
_apply_and_filter_by_delay_pattern_mask(outputs) {
|
|
17654
17701
|
const [bs_x_codebooks, seqLength] = outputs.dims;
|
|
17702
|
+
// @ts-expect-error TS2339
|
|
17655
17703
|
const num_codebooks = this.config.decoder.num_codebooks;
|
|
17656
17704
|
const upperBound = (seqLength - num_codebooks);
|
|
17657
17705
|
|
|
17658
17706
|
let newDataSize = 0;
|
|
17659
17707
|
for (let i = 0; i < outputs.size; ++i) {
|
|
17708
|
+
// @ts-expect-error TS2339
|
|
17660
17709
|
if (outputs.data[i] === this.config.decoder.pad_token_id) {
|
|
17661
17710
|
continue;
|
|
17662
17711
|
}
|
|
@@ -17686,7 +17735,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
|
|
|
17686
17735
|
let clonedInputIds = structuredClone(input_ids);
|
|
17687
17736
|
for (let i = 0; i < clonedInputIds.length; ++i) {
|
|
17688
17737
|
for (let j = 0; j < clonedInputIds[i].length; ++j) {
|
|
17738
|
+
// @ts-expect-error TS2339
|
|
17689
17739
|
if ((i % this.config.decoder.num_codebooks) >= j) {
|
|
17740
|
+
// @ts-expect-error TS2339
|
|
17690
17741
|
clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
|
|
17691
17742
|
}
|
|
17692
17743
|
}
|
|
@@ -17843,6 +17894,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
|
|
|
17843
17894
|
'past_key_values',
|
|
17844
17895
|
];
|
|
17845
17896
|
|
|
17897
|
+
/**
|
|
17898
|
+
* @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
|
|
17899
|
+
*/
|
|
17846
17900
|
constructor(...args) {
|
|
17847
17901
|
super(...args);
|
|
17848
17902
|
|
|
@@ -18811,10 +18865,17 @@ class SequenceClassifierOutput extends ModelOutput {
|
|
|
18811
18865
|
/**
|
|
18812
18866
|
* @param {Object} output The output of the model.
|
|
18813
18867
|
* @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
|
|
18868
|
+
* @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
|
|
18869
|
+
* Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
|
18814
18870
|
*/
|
|
18815
|
-
constructor({ logits }) {
|
|
18871
|
+
constructor({ logits, ...attentions }) {
|
|
18816
18872
|
super();
|
|
18817
18873
|
this.logits = logits;
|
|
18874
|
+
const attentions_list = Object.values(attentions);
|
|
18875
|
+
if (attentions_list.length > 0) {
|
|
18876
|
+
// Only set attentions if they are not empty
|
|
18877
|
+
this.attentions = attentions_list;
|
|
18878
|
+
}
|
|
18818
18879
|
}
|
|
18819
18880
|
}
|
|
18820
18881
|
|
|
@@ -19070,22 +19131,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
19070
19131
|
|
|
19071
19132
|
class AutoFeatureExtractor {
|
|
19072
19133
|
|
|
19073
|
-
/**
|
|
19074
|
-
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
19075
|
-
*
|
|
19076
|
-
* The processor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
19077
|
-
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
19078
|
-
*
|
|
19079
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
19080
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
19081
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
19082
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
19083
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
19084
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
19085
|
-
*
|
|
19086
|
-
* @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
|
|
19087
|
-
*/
|
|
19088
|
-
|
|
19089
19134
|
/** @type {typeof FeatureExtractor.from_pretrained} */
|
|
19090
19135
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
19091
19136
|
|
|
@@ -19212,22 +19257,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
19212
19257
|
*/
|
|
19213
19258
|
class AutoProcessor {
|
|
19214
19259
|
|
|
19215
|
-
/**
|
|
19216
|
-
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
19217
|
-
*
|
|
19218
|
-
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
19219
|
-
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
19220
|
-
*
|
|
19221
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
19222
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
19223
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
19224
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
19225
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
19226
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
19227
|
-
*
|
|
19228
|
-
* @returns {Promise<Processor>} A new instance of the Processor class.
|
|
19229
|
-
*/
|
|
19230
|
-
|
|
19231
19260
|
/** @type {typeof Processor.from_pretrained} */
|
|
19232
19261
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
19233
19262
|
|
|
@@ -19545,6 +19574,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
19545
19574
|
/**
|
|
19546
19575
|
* Percentage of the image to crop. Only has an effect if this.size < 384.
|
|
19547
19576
|
*/
|
|
19577
|
+
// @ts-expect-error TS2339
|
|
19548
19578
|
this.crop_pct = this.config.crop_pct ?? (224 / 256);
|
|
19549
19579
|
}
|
|
19550
19580
|
|
|
@@ -19747,6 +19777,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
19747
19777
|
class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
|
|
19748
19778
|
constructor(config) {
|
|
19749
19779
|
super(config);
|
|
19780
|
+
// @ts-expect-error TS2339
|
|
19750
19781
|
this.include_top = this.config.include_top ?? true;
|
|
19751
19782
|
if (this.include_top) {
|
|
19752
19783
|
this.image_std = this.image_std.map(x => x * x);
|
|
@@ -19828,8 +19859,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
19828
19859
|
super(config, components);
|
|
19829
19860
|
|
|
19830
19861
|
const {
|
|
19862
|
+
// @ts-expect-error TS2339
|
|
19831
19863
|
tasks_answer_post_processing_type,
|
|
19864
|
+
// @ts-expect-error TS2339
|
|
19832
19865
|
task_prompts_without_inputs,
|
|
19866
|
+
// @ts-expect-error TS2339
|
|
19833
19867
|
task_prompts_with_input,
|
|
19834
19868
|
} = this.image_processor.config;
|
|
19835
19869
|
|
|
@@ -20124,6 +20158,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
20124
20158
|
|
|
20125
20159
|
const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
|
|
20126
20160
|
const end_offset = (i + 1) * pixel_attention_mask_stride;
|
|
20161
|
+
|
|
20162
|
+
// @ts-expect-error
|
|
20127
20163
|
pixel_attention_mask_data.fill(false, start_offset, end_offset);
|
|
20128
20164
|
}
|
|
20129
20165
|
}
|
|
@@ -20530,6 +20566,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
|
|
|
20530
20566
|
},
|
|
20531
20567
|
...config,
|
|
20532
20568
|
});
|
|
20569
|
+
// @ts-expect-error TS2339
|
|
20533
20570
|
this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
|
|
20534
20571
|
}
|
|
20535
20572
|
|
|
@@ -20971,6 +21008,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
|
|
|
20971
21008
|
* - bpe_preds: The list of BPE decoded sentences.
|
|
20972
21009
|
* - wp_preds: The list of wp decoded sentences.
|
|
20973
21010
|
*/
|
|
21011
|
+
// @ts-expect-error The type of this method is not compatible with the one
|
|
21012
|
+
// in the base class. It might be a good idea to fix this.
|
|
20974
21013
|
batch_decode([char_logits, bpe_logits, wp_logits]) {
|
|
20975
21014
|
const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
|
|
20976
21015
|
const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
|
|
@@ -21352,6 +21391,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
21352
21391
|
}
|
|
21353
21392
|
|
|
21354
21393
|
const bos_token = this.tokenizer.bos_token;
|
|
21394
|
+
// @ts-expect-error TS2339
|
|
21355
21395
|
const image_seq_length = this.image_processor.config.image_seq_length;
|
|
21356
21396
|
let input_strings;
|
|
21357
21397
|
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
|
|
@@ -21602,7 +21642,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
|
|
|
21602
21642
|
*
|
|
21603
21643
|
* @param {string|string[]} text
|
|
21604
21644
|
* @param {RawImage|RawImage[]} images
|
|
21605
|
-
* @param {
|
|
21645
|
+
* @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
|
|
21606
21646
|
* @returns {Promise<any>}
|
|
21607
21647
|
*/
|
|
21608
21648
|
async _call(text, images = null, {
|
|
@@ -21786,6 +21826,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
|
|
|
21786
21826
|
|
|
21787
21827
|
let current_speaker = -1;
|
|
21788
21828
|
for (let i = 0; i < scores.length; ++i) {
|
|
21829
|
+
/** @type {number[]} */
|
|
21789
21830
|
const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
|
|
21790
21831
|
const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
|
|
21791
21832
|
const [start, end] = [i, i + 1];
|
|
@@ -21970,6 +22011,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
|
|
|
21970
22011
|
}
|
|
21971
22012
|
|
|
21972
22013
|
if (image_grid_thw) {
|
|
22014
|
+
// @ts-expect-error TS2551
|
|
21973
22015
|
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21974
22016
|
let index = 0;
|
|
21975
22017
|
|
|
@@ -22457,8 +22499,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
|
|
|
22457
22499
|
'int64',
|
|
22458
22500
|
new BigInt64Array(numPaddedFrames),
|
|
22459
22501
|
[1, numPaddedFrames],
|
|
22460
|
-
)
|
|
22461
|
-
padded_attention_mask.data.fill(1n, 0, num_frames);
|
|
22502
|
+
);
|
|
22503
|
+
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
|
|
22462
22504
|
}
|
|
22463
22505
|
}
|
|
22464
22506
|
}
|
|
@@ -23258,7 +23300,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
|
|
|
23258
23300
|
)
|
|
23259
23301
|
|
|
23260
23302
|
const data = features.data;
|
|
23261
|
-
const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
|
|
23303
|
+
const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
|
|
23262
23304
|
|
|
23263
23305
|
for (let i = 0; i < data.length; ++i) {
|
|
23264
23306
|
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|
|
@@ -23517,6 +23559,16 @@ class TensorOpRegistry {
|
|
|
23517
23559
|
// executionProviders: ['webgpu'],
|
|
23518
23560
|
};
|
|
23519
23561
|
|
|
23562
|
+
static get nearest_interpolate_4d() {
|
|
23563
|
+
if (!this._nearest_interpolate_4d) {
|
|
23564
|
+
this._nearest_interpolate_4d = wrap(
|
|
23565
|
+
[8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
|
|
23566
|
+
this.session_options,
|
|
23567
|
+
'y',
|
|
23568
|
+
);
|
|
23569
|
+
}
|
|
23570
|
+
return this._nearest_interpolate_4d;
|
|
23571
|
+
}
|
|
23520
23572
|
static get bilinear_interpolate_4d() {
|
|
23521
23573
|
if (!this._bilinear_interpolate_4d) {
|
|
23522
23574
|
this._bilinear_interpolate_4d = wrap(
|
|
@@ -23890,6 +23942,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
|
|
|
23890
23942
|
|
|
23891
23943
|
// TODO: Use softmax tensor function
|
|
23892
23944
|
const function_to_apply =
|
|
23945
|
+
// @ts-expect-error TS2339
|
|
23893
23946
|
this.model.config.problem_type === 'multi_label_classification'
|
|
23894
23947
|
? batch => batch.sigmoid()
|
|
23895
23948
|
: batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
|
|
@@ -23898,6 +23951,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
|
|
|
23898
23951
|
batch.dims,
|
|
23899
23952
|
); // single_label_classification (default)
|
|
23900
23953
|
|
|
23954
|
+
// @ts-expect-error TS2339
|
|
23901
23955
|
const id2label = this.model.config.id2label;
|
|
23902
23956
|
|
|
23903
23957
|
const toReturn = [];
|
|
@@ -24000,6 +24054,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
|
|
|
24000
24054
|
const outputs = await this.model(model_inputs)
|
|
24001
24055
|
|
|
24002
24056
|
const logits = outputs.logits;
|
|
24057
|
+
// @ts-expect-error TS2339
|
|
24003
24058
|
const id2label = this.model.config.id2label;
|
|
24004
24059
|
|
|
24005
24060
|
const toReturn = [];
|
|
@@ -24339,11 +24394,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
|
|
|
24339
24394
|
|
|
24340
24395
|
|
|
24341
24396
|
// Add global prefix, if present
|
|
24397
|
+
// @ts-expect-error TS2339
|
|
24342
24398
|
if (this.model.config.prefix) {
|
|
24399
|
+
// @ts-expect-error TS2339
|
|
24343
24400
|
texts = texts.map(x => this.model.config.prefix + x)
|
|
24344
24401
|
}
|
|
24345
24402
|
|
|
24346
24403
|
// Handle task specific params:
|
|
24404
|
+
// @ts-expect-error TS2339
|
|
24347
24405
|
const task_specific_params = this.model.config.task_specific_params
|
|
24348
24406
|
if (task_specific_params && task_specific_params[this.task]) {
|
|
24349
24407
|
// Add prefixes, if present
|
|
@@ -25082,6 +25140,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
|
|
|
25082
25140
|
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
25083
25141
|
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
25084
25142
|
|
|
25143
|
+
// @ts-expect-error TS2339
|
|
25085
25144
|
const id2label = this.model.config.id2label;
|
|
25086
25145
|
|
|
25087
25146
|
const toReturn = [];
|
|
@@ -25392,6 +25451,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
25392
25451
|
audio = [/** @type {AudioInput} */ (audio)];
|
|
25393
25452
|
}
|
|
25394
25453
|
|
|
25454
|
+
// @ts-expect-error TS2339
|
|
25395
25455
|
const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
|
|
25396
25456
|
const hop_length = this.processor.feature_extractor.config.hop_length;
|
|
25397
25457
|
|
|
@@ -25457,7 +25517,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
25457
25517
|
|
|
25458
25518
|
// TODO: Right now we only get top beam
|
|
25459
25519
|
if (return_timestamps === 'word') {
|
|
25520
|
+
// @ts-expect-error TS2339
|
|
25460
25521
|
chunk.tokens = data.sequences.tolist()[0];
|
|
25522
|
+
// @ts-expect-error TS2339
|
|
25461
25523
|
chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
|
|
25462
25524
|
(/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
|
|
25463
25525
|
);
|
|
@@ -25502,7 +25564,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
25502
25564
|
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
25503
25565
|
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
25504
25566
|
|
|
25505
|
-
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
25567
|
+
const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
|
|
25506
25568
|
toReturn.push({ text });
|
|
25507
25569
|
}
|
|
25508
25570
|
return single ? toReturn[0] : toReturn;
|
|
@@ -25651,6 +25713,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
|
|
|
25651
25713
|
const { pixel_values } = await this.processor(preparedImages);
|
|
25652
25714
|
const output = await this.model({ pixel_values });
|
|
25653
25715
|
|
|
25716
|
+
// @ts-expect-error TS2339
|
|
25654
25717
|
const id2label = this.model.config.id2label;
|
|
25655
25718
|
|
|
25656
25719
|
/** @type {ImageClassificationOutput[]} */
|
|
@@ -25765,6 +25828,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
|
|
|
25765
25828
|
}
|
|
25766
25829
|
}
|
|
25767
25830
|
|
|
25831
|
+
// @ts-expect-error TS2339
|
|
25768
25832
|
const id2label = this.model.config.id2label;
|
|
25769
25833
|
|
|
25770
25834
|
/** @type {ImageSegmentationPipelineOutput[]} */
|
|
@@ -25991,6 +26055,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
|
|
|
25991
26055
|
const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
|
|
25992
26056
|
|
|
25993
26057
|
// Add labels
|
|
26058
|
+
// @ts-expect-error TS2339
|
|
25994
26059
|
const id2label = this.model.config.id2label;
|
|
25995
26060
|
|
|
25996
26061
|
// Format output
|
|
@@ -26210,6 +26275,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
|
|
|
26210
26275
|
// Run model
|
|
26211
26276
|
const output = await this.model.generate({
|
|
26212
26277
|
inputs: pixel_values,
|
|
26278
|
+
// @ts-expect-error TS2339
|
|
26213
26279
|
max_length: this.model.config.decoder.max_position_embeddings,
|
|
26214
26280
|
decoder_input_ids,
|
|
26215
26281
|
...generate_kwargs,
|
|
@@ -26325,6 +26391,7 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
|
|
|
26325
26391
|
// Generate waveform
|
|
26326
26392
|
const { waveform } = await this.model(inputs);
|
|
26327
26393
|
|
|
26394
|
+
// @ts-expect-error TS2339
|
|
26328
26395
|
const sampling_rate = this.model.config.sampling_rate;
|
|
26329
26396
|
return {
|
|
26330
26397
|
audio: waveform.data,
|
|
@@ -26482,11 +26549,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
|
|
|
26482
26549
|
|
|
26483
26550
|
const toReturn = [];
|
|
26484
26551
|
for (let i = 0; i < preparedImages.length; ++i) {
|
|
26485
|
-
const
|
|
26486
|
-
const
|
|
26552
|
+
const batch = predicted_depth[i];
|
|
26553
|
+
const [height, width] = batch.dims.slice(-2);
|
|
26554
|
+
const [new_width, new_height] = preparedImages[i].size;
|
|
26555
|
+
|
|
26556
|
+
// Interpolate to original size
|
|
26557
|
+
const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
|
|
26558
|
+
size: [new_height, new_width],
|
|
26559
|
+
mode: 'bilinear',
|
|
26560
|
+
})).view(new_height, new_width);
|
|
26561
|
+
|
|
26562
|
+
const minval = /** @type {number} */(prediction.min().item());
|
|
26563
|
+
const maxval = /** @type {number} */(prediction.max().item());
|
|
26564
|
+
const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
|
|
26565
|
+
const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
|
|
26487
26566
|
toReturn.push({
|
|
26488
|
-
predicted_depth:
|
|
26489
|
-
depth
|
|
26567
|
+
predicted_depth: prediction,
|
|
26568
|
+
depth,
|
|
26490
26569
|
});
|
|
26491
26570
|
}
|
|
26492
26571
|
|
|
@@ -26966,6 +27045,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
26966
27045
|
return result;
|
|
26967
27046
|
}
|
|
26968
27047
|
|
|
27048
|
+
|
|
26969
27049
|
/***/ }),
|
|
26970
27050
|
|
|
26971
27051
|
/***/ "./src/tokenizers.js":
|
|
@@ -27034,7 +27114,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
27034
27114
|
/* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
|
|
27035
27115
|
/* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
|
|
27036
27116
|
/* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
|
|
27037
|
-
/* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
|
|
27038
27117
|
|
|
27039
27118
|
/**
|
|
27040
27119
|
* @file Tokenizers are used to prepare textual inputs for a model.
|
|
@@ -27071,7 +27150,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
27071
27150
|
|
|
27072
27151
|
|
|
27073
27152
|
|
|
27074
|
-
|
|
27075
27153
|
/**
|
|
27076
27154
|
* @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
|
|
27077
27155
|
* @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
|
|
@@ -27555,7 +27633,7 @@ class Unigram extends TokenizerModel {
|
|
|
27555
27633
|
* Create a new Unigram tokenizer model.
|
|
27556
27634
|
* @param {Object} config The configuration object for the Unigram model.
|
|
27557
27635
|
* @param {number} config.unk_id The ID of the unknown token
|
|
27558
|
-
* @param {
|
|
27636
|
+
* @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
|
|
27559
27637
|
* @param {Object} moreConfig Additional configuration object for the Unigram model.
|
|
27560
27638
|
*/
|
|
27561
27639
|
constructor(config, moreConfig) {
|
|
@@ -27563,11 +27641,10 @@ class Unigram extends TokenizerModel {
|
|
|
27563
27641
|
|
|
27564
27642
|
const vocabSize = config.vocab.length;
|
|
27565
27643
|
this.vocab = new Array(vocabSize);
|
|
27644
|
+
/** @type {number[]} */
|
|
27566
27645
|
this.scores = new Array(vocabSize);
|
|
27567
27646
|
for (let i = 0; i < vocabSize; ++i) {
|
|
27568
|
-
|
|
27569
|
-
this.vocab[i] = piece[0];
|
|
27570
|
-
this.scores[i] = piece[1];
|
|
27647
|
+
[this.vocab[i], this.scores[i]] = config.vocab[i];
|
|
27571
27648
|
}
|
|
27572
27649
|
|
|
27573
27650
|
this.unk_token_id = config.unk_id;
|
|
@@ -32924,6 +33001,8 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
32924
33001
|
/* harmony export */ });
|
|
32925
33002
|
/* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
|
|
32926
33003
|
/* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
|
|
33004
|
+
/// <reference types="@webgpu/types" />
|
|
33005
|
+
|
|
32927
33006
|
|
|
32928
33007
|
|
|
32929
33008
|
|
|
@@ -33177,7 +33256,7 @@ class FileResponse {
|
|
|
33177
33256
|
*/
|
|
33178
33257
|
async arrayBuffer() {
|
|
33179
33258
|
const data = await fs__WEBPACK_IMPORTED_MODULE_0__.promises.readFile(this.filePath);
|
|
33180
|
-
return data.buffer;
|
|
33259
|
+
return /** @type {ArrayBuffer} */ (data.buffer);
|
|
33181
33260
|
}
|
|
33182
33261
|
|
|
33183
33262
|
/**
|
|
@@ -34836,8 +34915,9 @@ function magnitude(arr) {
|
|
|
34836
34915
|
|
|
34837
34916
|
/**
|
|
34838
34917
|
* Returns the value and index of the minimum element in an array.
|
|
34839
|
-
* @
|
|
34840
|
-
* @
|
|
34918
|
+
* @template {number[]|bigint[]|AnyTypedArray} T
|
|
34919
|
+
* @param {T} arr array of numbers.
|
|
34920
|
+
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
|
|
34841
34921
|
* @throws {Error} If array is empty.
|
|
34842
34922
|
*/
|
|
34843
34923
|
function min(arr) {
|
|
@@ -34850,14 +34930,15 @@ function min(arr) {
|
|
|
34850
34930
|
indexOfMin = i;
|
|
34851
34931
|
}
|
|
34852
34932
|
}
|
|
34853
|
-
return [min, indexOfMin];
|
|
34933
|
+
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
|
|
34854
34934
|
}
|
|
34855
34935
|
|
|
34856
34936
|
|
|
34857
34937
|
/**
|
|
34858
34938
|
* Returns the value and index of the maximum element in an array.
|
|
34859
|
-
* @
|
|
34860
|
-
* @
|
|
34939
|
+
* @template {number[]|bigint[]|AnyTypedArray} T
|
|
34940
|
+
* @param {T} arr array of numbers.
|
|
34941
|
+
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
|
|
34861
34942
|
* @throws {Error} If array is empty.
|
|
34862
34943
|
*/
|
|
34863
34944
|
function max(arr) {
|
|
@@ -34870,7 +34951,7 @@ function max(arr) {
|
|
|
34870
34951
|
indexOfMax = i;
|
|
34871
34952
|
}
|
|
34872
34953
|
}
|
|
34873
|
-
return [
|
|
34954
|
+
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
|
|
34874
34955
|
}
|
|
34875
34956
|
|
|
34876
34957
|
function isPowerOfTwo(number) {
|
|
@@ -36167,8 +36248,6 @@ class Tensor {
|
|
|
36167
36248
|
return this.permute(...dims);
|
|
36168
36249
|
}
|
|
36169
36250
|
|
|
36170
|
-
// TODO add .max() and .min() methods
|
|
36171
|
-
|
|
36172
36251
|
/**
|
|
36173
36252
|
* Returns the sum of each row of the input tensor in the given dimension dim.
|
|
36174
36253
|
*
|
|
@@ -36462,6 +36541,36 @@ class Tensor {
|
|
|
36462
36541
|
return mean(this, dim, keepdim);
|
|
36463
36542
|
}
|
|
36464
36543
|
|
|
36544
|
+
min(dim = null, keepdim = false) {
|
|
36545
|
+
if (dim !== null) {
|
|
36546
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
36547
|
+
}
|
|
36548
|
+
const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
|
|
36549
|
+
return new Tensor(this.type, [value], []);
|
|
36550
|
+
}
|
|
36551
|
+
max(dim = null, keepdim = false) {
|
|
36552
|
+
if (dim !== null) {
|
|
36553
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
36554
|
+
}
|
|
36555
|
+
const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
|
|
36556
|
+
return new Tensor(this.type, [value], []);
|
|
36557
|
+
}
|
|
36558
|
+
|
|
36559
|
+
argmin(dim = null, keepdim = false) {
|
|
36560
|
+
if (dim !== null) {
|
|
36561
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
36562
|
+
}
|
|
36563
|
+
const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
|
|
36564
|
+
return new Tensor('int64', [BigInt(index)], []);
|
|
36565
|
+
}
|
|
36566
|
+
argmax(dim = null, keepdim = false) {
|
|
36567
|
+
if (dim !== null) {
|
|
36568
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
36569
|
+
}
|
|
36570
|
+
const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
|
|
36571
|
+
return new Tensor('int64', [BigInt(index)], []);
|
|
36572
|
+
}
|
|
36573
|
+
|
|
36465
36574
|
/**
|
|
36466
36575
|
* Performs Tensor dtype conversion.
|
|
36467
36576
|
* @param {DataType} type The desired data type.
|
|
@@ -36595,7 +36704,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
|
|
|
36595
36704
|
* @param {Tensor} input the input tensor
|
|
36596
36705
|
* @param {Object} options the options for the interpolation
|
|
36597
36706
|
* @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
|
|
36598
|
-
* @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
|
|
36707
|
+
* @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
|
|
36599
36708
|
* @returns {Promise<Tensor>} The interpolated tensor.
|
|
36600
36709
|
*/
|
|
36601
36710
|
async function interpolate_4d(input, {
|
|
@@ -36625,7 +36734,9 @@ async function interpolate_4d(input, {
|
|
|
36625
36734
|
}
|
|
36626
36735
|
|
|
36627
36736
|
let op;
|
|
36628
|
-
if (mode === '
|
|
36737
|
+
if (mode === 'nearest') {
|
|
36738
|
+
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
|
|
36739
|
+
} else if (mode === 'bilinear') {
|
|
36629
36740
|
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
|
|
36630
36741
|
} else if (mode === 'bicubic') {
|
|
36631
36742
|
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
|
|
@@ -36666,13 +36777,13 @@ async function rfft(x, a) {
|
|
|
36666
36777
|
* Returns the k largest elements of the given input tensor.
|
|
36667
36778
|
* Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
|
|
36668
36779
|
* @param {Tensor} x the input tensor
|
|
36669
|
-
* @param {number} k the k in "top-k"
|
|
36780
|
+
* @param {number} [k] the k in "top-k"
|
|
36670
36781
|
* @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
|
|
36671
36782
|
*/
|
|
36672
36783
|
async function topk(x, k) {
|
|
36673
36784
|
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
|
|
36674
36785
|
|
|
36675
|
-
if (k
|
|
36786
|
+
if (k == null) {
|
|
36676
36787
|
k = x.dims.at(-1);
|
|
36677
36788
|
} else {
|
|
36678
36789
|
k = Math.min(k, x.dims.at(-1));
|
|
@@ -36701,10 +36812,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
|
|
|
36701
36812
|
async function slice(data, starts, ends, axes, steps) {
|
|
36702
36813
|
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
|
|
36703
36814
|
return await op({
|
|
36704
|
-
x: data,
|
|
36705
|
-
s: arrayToIndexTensor(starts),
|
|
36706
|
-
e: arrayToIndexTensor(ends),
|
|
36707
|
-
a: arrayToIndexTensor(axes),
|
|
36815
|
+
x: data,
|
|
36816
|
+
s: arrayToIndexTensor(starts),
|
|
36817
|
+
e: arrayToIndexTensor(ends),
|
|
36818
|
+
a: arrayToIndexTensor(axes),
|
|
36708
36819
|
t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
|
|
36709
36820
|
});
|
|
36710
36821
|
}
|