@huggingface/transformers 3.2.3 → 3.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/transformers.cjs +203 -92
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +203 -92
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +203 -92
- package/dist/transformers.mjs.map +1 -1
- package/package.json +2 -2
- package/src/base/feature_extraction_utils.js +9 -9
- package/src/base/image_processors_utils.js +11 -0
- package/src/base/processing_utils.js +13 -3
- package/src/configs.js +5 -0
- package/src/env.js +1 -1
- package/src/models/auto/feature_extraction_auto.js +0 -16
- package/src/models/auto/processing_auto.js +0 -16
- package/src/models/convnext/image_processing_convnext.js +1 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +1 -0
- package/src/models/florence2/processing_florence2.js +3 -0
- package/src/models/idefics3/image_processing_idefics3.js +2 -0
- package/src/models/janus/image_processing_janus.js +1 -0
- package/src/models/mgp_str/processing_mgp_str.js +2 -0
- package/src/models/paligemma/processing_paligemma.js +1 -0
- package/src/models/phi3_v/processing_phi3_v.js +1 -1
- package/src/models/pyannote/feature_extraction_pyannote.js +1 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +1 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +2 -2
- package/src/models/whisper/feature_extraction_whisper.js +1 -1
- package/src/models.js +50 -15
- package/src/ops/registry.js +10 -0
- package/src/pipelines.js +34 -7
- package/src/tokenizers.js +4 -7
- package/src/utils/dtypes.js +2 -0
- package/src/utils/hub.js +1 -1
- package/src/utils/maths.js +8 -6
- package/src/utils/tensor.js +42 -10
- package/types/base/feature_extraction_utils.d.ts +7 -7
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +17 -19
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -1
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -1
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
- package/types/models/phi3_v/processing_phi3_v.d.ts +6 -2
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -1
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/sapiens/image_processing_sapiens.d.ts +10 -0
- package/types/models/sapiens/image_processing_sapiens.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models.d.ts +32 -17
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -2
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/tsconfig.tsbuildinfo +1 -0
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +3 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +8 -6
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +8 -4
- package/types/utils/tensor.d.ts.map +1 -1
package/dist/transformers.cjs
CHANGED
|
@@ -4158,23 +4158,23 @@ class FeatureExtractor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Ca
|
|
|
4158
4158
|
}
|
|
4159
4159
|
|
|
4160
4160
|
/**
|
|
4161
|
-
* Instantiate one of the
|
|
4161
|
+
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
4162
4162
|
*
|
|
4163
|
-
* The
|
|
4164
|
-
*
|
|
4163
|
+
* The feature extractor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
4164
|
+
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
4165
4165
|
*
|
|
4166
4166
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
4167
|
-
* - A string, the *model id* of a pretrained
|
|
4167
|
+
* - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on huggingface.co.
|
|
4168
4168
|
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
4169
4169
|
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
4170
|
-
* - A path to a *directory* containing
|
|
4171
|
-
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the
|
|
4170
|
+
* - A path to a *directory* containing feature_extractor files, e.g., `./my_model_directory/`.
|
|
4171
|
+
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the feature_extractor.
|
|
4172
4172
|
*
|
|
4173
|
-
* @returns {Promise<FeatureExtractor>} A new instance of the
|
|
4173
|
+
* @returns {Promise<FeatureExtractor>} A new instance of the Feature Extractor class.
|
|
4174
4174
|
*/
|
|
4175
4175
|
static async from_pretrained(pretrained_model_name_or_path, options) {
|
|
4176
|
-
const
|
|
4177
|
-
return new this(
|
|
4176
|
+
const config = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_2__.getModelJSON)(pretrained_model_name_or_path, _utils_constants_js__WEBPACK_IMPORTED_MODULE_0__.FEATURE_EXTRACTOR_NAME, true, options);
|
|
4177
|
+
return new this(config);
|
|
4178
4178
|
}
|
|
4179
4179
|
}
|
|
4180
4180
|
|
|
@@ -4825,14 +4825,20 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
4825
4825
|
this.do_thumbnail = config.do_thumbnail;
|
|
4826
4826
|
this.size = config.size ?? config.image_size;
|
|
4827
4827
|
this.do_resize = config.do_resize ?? (this.size !== undefined);
|
|
4828
|
+
// @ts-expect-error TS2339
|
|
4828
4829
|
this.size_divisibility = config.size_divisibility ?? config.size_divisor;
|
|
4829
4830
|
|
|
4830
4831
|
this.do_center_crop = config.do_center_crop;
|
|
4832
|
+
// @ts-expect-error TS2339
|
|
4831
4833
|
this.crop_size = config.crop_size;
|
|
4834
|
+
// @ts-expect-error TS2339
|
|
4832
4835
|
this.do_convert_rgb = config.do_convert_rgb ?? true;
|
|
4836
|
+
// @ts-expect-error TS2339
|
|
4833
4837
|
this.do_crop_margin = config.do_crop_margin;
|
|
4834
4838
|
|
|
4839
|
+
// @ts-expect-error TS2339
|
|
4835
4840
|
this.pad_size = config.pad_size;
|
|
4841
|
+
// @ts-expect-error TS2339
|
|
4836
4842
|
this.do_pad = config.do_pad;
|
|
4837
4843
|
|
|
4838
4844
|
if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
|
|
@@ -5041,6 +5047,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5041
5047
|
// Support both formats for backwards compatibility
|
|
5042
5048
|
else if (Number.isInteger(size)) {
|
|
5043
5049
|
shortest_edge = size;
|
|
5050
|
+
// @ts-expect-error TS2339
|
|
5044
5051
|
longest_edge = this.config.max_size ?? shortest_edge;
|
|
5045
5052
|
|
|
5046
5053
|
} else if (size !== undefined) {
|
|
@@ -5109,6 +5116,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5109
5116
|
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
|
|
5110
5117
|
// Custom resize logic for Qwen2-VL models
|
|
5111
5118
|
const { min_pixels, max_pixels } = size;
|
|
5119
|
+
// @ts-expect-error TS2339
|
|
5112
5120
|
const factor = this.config.patch_size * this.config.merge_size;
|
|
5113
5121
|
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
|
|
5114
5122
|
} else {
|
|
@@ -5124,6 +5132,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5124
5132
|
async resize(image) {
|
|
5125
5133
|
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
|
|
5126
5134
|
return await image.resize(newWidth, newHeight, {
|
|
5135
|
+
// @ts-expect-error TS2322
|
|
5127
5136
|
resample: this.resample,
|
|
5128
5137
|
});
|
|
5129
5138
|
}
|
|
@@ -5174,6 +5183,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5174
5183
|
|
|
5175
5184
|
// Resize the image using thumbnail method.
|
|
5176
5185
|
if (this.do_thumbnail) {
|
|
5186
|
+
// @ts-expect-error TS2345
|
|
5177
5187
|
image = await this.thumbnail(image, this.size, this.resample);
|
|
5178
5188
|
}
|
|
5179
5189
|
|
|
@@ -5198,6 +5208,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
5198
5208
|
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
|
|
5199
5209
|
// occurs with data in the hwc format (height, width, channels),
|
|
5200
5210
|
// to emulate the behavior of the original Python code (w/ numpy).
|
|
5211
|
+
/** @type {Float32Array} */
|
|
5201
5212
|
let pixelData = Float32Array.from(image.data);
|
|
5202
5213
|
let imgDims = [image.height, image.width, image.channels];
|
|
5203
5214
|
|
|
@@ -5356,6 +5367,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
5356
5367
|
/**
|
|
5357
5368
|
* @typedef {Object} ProcessorProperties Additional processor-specific properties.
|
|
5358
5369
|
* @typedef {import('../utils/hub.js').PretrainedOptions & ProcessorProperties} PretrainedProcessorOptions
|
|
5370
|
+
* @typedef {import('../tokenizers.js').PreTrainedTokenizer} PreTrainedTokenizer
|
|
5359
5371
|
*/
|
|
5360
5372
|
|
|
5361
5373
|
|
|
@@ -5389,7 +5401,7 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5389
5401
|
}
|
|
5390
5402
|
|
|
5391
5403
|
/**
|
|
5392
|
-
* @returns {
|
|
5404
|
+
* @returns {PreTrainedTokenizer|undefined} The tokenizer of the processor, if it exists.
|
|
5393
5405
|
*/
|
|
5394
5406
|
get tokenizer() {
|
|
5395
5407
|
return this.components.tokenizer;
|
|
@@ -5402,6 +5414,11 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5402
5414
|
return this.components.feature_extractor;
|
|
5403
5415
|
}
|
|
5404
5416
|
|
|
5417
|
+
/**
|
|
5418
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[0]} messages
|
|
5419
|
+
* @param {Parameters<PreTrainedTokenizer['apply_chat_template']>[1]} options
|
|
5420
|
+
* @returns {ReturnType<PreTrainedTokenizer['apply_chat_template']>}
|
|
5421
|
+
*/
|
|
5405
5422
|
apply_chat_template(messages, options = {}) {
|
|
5406
5423
|
if (!this.tokenizer) {
|
|
5407
5424
|
throw new Error('Unable to apply chat template without a tokenizer.');
|
|
@@ -5412,6 +5429,10 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5412
5429
|
});
|
|
5413
5430
|
}
|
|
5414
5431
|
|
|
5432
|
+
/**
|
|
5433
|
+
* @param {Parameters<PreTrainedTokenizer['batch_decode']>} args
|
|
5434
|
+
* @returns {ReturnType<PreTrainedTokenizer['batch_decode']>}
|
|
5435
|
+
*/
|
|
5415
5436
|
batch_decode(...args) {
|
|
5416
5437
|
if (!this.tokenizer) {
|
|
5417
5438
|
throw new Error('Unable to decode without a tokenizer.');
|
|
@@ -5439,8 +5460,8 @@ class Processor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_1__.Callable
|
|
|
5439
5460
|
/**
|
|
5440
5461
|
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
5441
5462
|
*
|
|
5442
|
-
* The processor class to instantiate is selected based on the `
|
|
5443
|
-
* (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
5463
|
+
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
5464
|
+
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
5444
5465
|
*
|
|
5445
5466
|
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
5446
5467
|
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
@@ -5560,15 +5581,19 @@ function getNormalizedConfig(config) {
|
|
|
5560
5581
|
case 'florence2':
|
|
5561
5582
|
case 'llava_onevision':
|
|
5562
5583
|
case 'idefics3':
|
|
5584
|
+
// @ts-expect-error TS2339
|
|
5563
5585
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
5564
5586
|
break;
|
|
5565
5587
|
case 'moondream1':
|
|
5588
|
+
// @ts-expect-error TS2339
|
|
5566
5589
|
init_normalized_config = getNormalizedConfig(config.phi_config);
|
|
5567
5590
|
break;
|
|
5568
5591
|
case 'musicgen':
|
|
5592
|
+
// @ts-expect-error TS2339
|
|
5569
5593
|
init_normalized_config = getNormalizedConfig(config.decoder);
|
|
5570
5594
|
break;
|
|
5571
5595
|
case 'multi_modality':
|
|
5596
|
+
// @ts-expect-error TS2339
|
|
5572
5597
|
init_normalized_config = getNormalizedConfig(config.language_config);
|
|
5573
5598
|
break;
|
|
5574
5599
|
|
|
@@ -5689,6 +5714,7 @@ function getNormalizedConfig(config) {
|
|
|
5689
5714
|
break;
|
|
5690
5715
|
|
|
5691
5716
|
case 'vision-encoder-decoder':
|
|
5717
|
+
// @ts-expect-error TS2339
|
|
5692
5718
|
const decoderConfig = getNormalizedConfig(config.decoder);
|
|
5693
5719
|
|
|
5694
5720
|
const add_encoder_pkv = 'num_decoder_layers' in decoderConfig;
|
|
@@ -5932,7 +5958,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
5932
5958
|
|
|
5933
5959
|
|
|
5934
5960
|
|
|
5935
|
-
const VERSION = '3.2.
|
|
5961
|
+
const VERSION = '3.2.4';
|
|
5936
5962
|
|
|
5937
5963
|
// Check if various APIs are available (depends on environment)
|
|
5938
5964
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -8594,8 +8620,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
8594
8620
|
} else if (session_options.externalData !== undefined) {
|
|
8595
8621
|
externalDataPromises = session_options.externalData.map(async (ext) => {
|
|
8596
8622
|
// if the external data is a string, fetch the file and replace the string with its content
|
|
8623
|
+
// @ts-expect-error TS2339
|
|
8597
8624
|
if (typeof ext.data === "string") {
|
|
8625
|
+
// @ts-expect-error TS2339
|
|
8598
8626
|
const ext_buffer = await (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, ext.data, true, options);
|
|
8627
|
+
// @ts-expect-error TS2698
|
|
8599
8628
|
return { ...ext, data: ext_buffer };
|
|
8600
8629
|
}
|
|
8601
8630
|
return ext;
|
|
@@ -9843,6 +9872,7 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
9843
9872
|
if (this.config.model_type === 'musicgen') {
|
|
9844
9873
|
// Custom logic (TODO: move to Musicgen class)
|
|
9845
9874
|
decoder_input_ids = Array.from({
|
|
9875
|
+
// @ts-expect-error TS2339
|
|
9846
9876
|
length: batch_size * this.config.decoder.num_codebooks
|
|
9847
9877
|
}, () => [decoder_start_token_id]);
|
|
9848
9878
|
|
|
@@ -10172,11 +10202,13 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10172
10202
|
async encode_image({ pixel_values }) {
|
|
10173
10203
|
// image_inputs === { pixel_values }
|
|
10174
10204
|
const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features;
|
|
10205
|
+
// @ts-expect-error TS2339
|
|
10175
10206
|
if (!this.config.num_image_tokens) {
|
|
10176
10207
|
console.warn(
|
|
10177
10208
|
'The number of image tokens was not set in the model configuration. ' +
|
|
10178
10209
|
`Setting it to the number of features detected by the vision encoder (${features.dims[1]}).`
|
|
10179
10210
|
)
|
|
10211
|
+
// @ts-expect-error TS2339
|
|
10180
10212
|
this.config.num_image_tokens = features.dims[1];
|
|
10181
10213
|
}
|
|
10182
10214
|
return features;
|
|
@@ -11604,6 +11636,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11604
11636
|
|
|
11605
11637
|
if (generation_config.return_token_timestamps) {
|
|
11606
11638
|
outputs["token_timestamps"] = this._extract_token_timestamps(
|
|
11639
|
+
// @ts-expect-error TS2345
|
|
11607
11640
|
outputs,
|
|
11608
11641
|
generation_config.alignment_heads,
|
|
11609
11642
|
generation_config.num_frames,
|
|
@@ -11639,6 +11672,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11639
11672
|
);
|
|
11640
11673
|
}
|
|
11641
11674
|
|
|
11675
|
+
// @ts-expect-error TS2339
|
|
11642
11676
|
let median_filter_width = this.config.median_filter_width;
|
|
11643
11677
|
if (median_filter_width === undefined) {
|
|
11644
11678
|
console.warn("Model config has no `median_filter_width`, using default value of 7.")
|
|
@@ -11649,6 +11683,7 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11649
11683
|
const batch = generate_outputs.cross_attentions;
|
|
11650
11684
|
// Create a list with `decoder_layers` elements, each a tensor of shape
|
|
11651
11685
|
// (batch size, attention_heads, output length, input length).
|
|
11686
|
+
// @ts-expect-error TS2339
|
|
11652
11687
|
const cross_attentions = Array.from({ length: this.config.decoder_layers },
|
|
11653
11688
|
// Concatenate the cross attentions for each layer across sequence length dimension.
|
|
11654
11689
|
(_, i) => (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.cat)(batch.map(x => x[i]), 2)
|
|
@@ -11792,6 +11827,7 @@ class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
|
|
|
11792
11827
|
attention_mask,
|
|
11793
11828
|
}) {
|
|
11794
11829
|
|
|
11830
|
+
// @ts-expect-error TS2339
|
|
11795
11831
|
const image_token_index = this.config.image_token_index;
|
|
11796
11832
|
|
|
11797
11833
|
const idsList = input_ids.tolist();
|
|
@@ -12777,6 +12813,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
12777
12813
|
const image_nums = vision_tokens.filter(x => x == image_token_id).length;
|
|
12778
12814
|
const video_nums = vision_tokens.filter(x => x == video_token_id).length;
|
|
12779
12815
|
|
|
12816
|
+
/** @type {number[][]} */
|
|
12780
12817
|
let llm_pos_ids_list = [];
|
|
12781
12818
|
let st = 0;
|
|
12782
12819
|
let remain_images = image_nums;
|
|
@@ -12846,6 +12883,7 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
12846
12883
|
// NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
|
|
12847
12884
|
// meaning to perform concatenation along dim=1, we can do the following:
|
|
12848
12885
|
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
12886
|
+
/** @type {number[]} */
|
|
12849
12887
|
const llm_positions = new Array(num_items);
|
|
12850
12888
|
let index = 0;
|
|
12851
12889
|
for (let x = 0; x < 3; ++x) {
|
|
@@ -12886,9 +12924,10 @@ class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
|
|
|
12886
12924
|
{ length: 3 * data.length },
|
|
12887
12925
|
(_, i) => data[i % data.length]
|
|
12888
12926
|
);
|
|
12927
|
+
/** @type {bigint[]} */
|
|
12889
12928
|
const mrope_position_deltas = Array.from(
|
|
12890
12929
|
{ length: dims[0] },
|
|
12891
|
-
(_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] +
|
|
12930
|
+
(_, i) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_11__.max)(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
12892
12931
|
);
|
|
12893
12932
|
|
|
12894
12933
|
return [
|
|
@@ -13459,7 +13498,7 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
13459
13498
|
*
|
|
13460
13499
|
* **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
|
|
13461
13500
|
* ```javascript
|
|
13462
|
-
* import { DPTForDepthEstimation, AutoProcessor, RawImage,
|
|
13501
|
+
* import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
13463
13502
|
*
|
|
13464
13503
|
* // Load model and processor
|
|
13465
13504
|
* const model_id = 'Xenova/dpt-hybrid-midas';
|
|
@@ -13468,7 +13507,7 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
13468
13507
|
*
|
|
13469
13508
|
* // Load image from URL
|
|
13470
13509
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
13471
|
-
* const image = await RawImage.
|
|
13510
|
+
* const image = await RawImage.read(url);
|
|
13472
13511
|
*
|
|
13473
13512
|
* // Prepare image for the model
|
|
13474
13513
|
* const inputs = await processor(image);
|
|
@@ -13477,10 +13516,15 @@ class DPTModel extends DPTPreTrainedModel { }
|
|
|
13477
13516
|
* const { predicted_depth } = await model(inputs);
|
|
13478
13517
|
*
|
|
13479
13518
|
* // Interpolate to original size
|
|
13480
|
-
* const prediction =
|
|
13519
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
13520
|
+
* size: image.size.reverse(),
|
|
13521
|
+
* mode: 'bilinear',
|
|
13522
|
+
* })).squeeze(1);
|
|
13481
13523
|
*
|
|
13482
13524
|
* // Visualize the prediction
|
|
13483
|
-
* const
|
|
13525
|
+
* const min = prediction.min().item();
|
|
13526
|
+
* const max = prediction.max().item();
|
|
13527
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
13484
13528
|
* const depth = RawImage.fromTensor(formatted);
|
|
13485
13529
|
* // RawImage {
|
|
13486
13530
|
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
@@ -13530,11 +13574,7 @@ class GLPNPreTrainedModel extends PreTrainedModel { }
|
|
|
13530
13574
|
class GLPNModel extends GLPNPreTrainedModel { }
|
|
13531
13575
|
|
|
13532
13576
|
/**
|
|
13533
|
-
*
|
|
13534
|
-
*
|
|
13535
|
-
* **Example:** Depth estimation w/ `Xenova/glpn-kitti`.
|
|
13536
|
-
* ```javascript
|
|
13537
|
-
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate, max } from '@huggingface/transformers';
|
|
13577
|
+
* import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers';
|
|
13538
13578
|
*
|
|
13539
13579
|
* // Load model and processor
|
|
13540
13580
|
* const model_id = 'Xenova/glpn-kitti';
|
|
@@ -13543,7 +13583,7 @@ class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
13543
13583
|
*
|
|
13544
13584
|
* // Load image from URL
|
|
13545
13585
|
* const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
|
|
13546
|
-
* const image = await RawImage.
|
|
13586
|
+
* const image = await RawImage.read(url);
|
|
13547
13587
|
*
|
|
13548
13588
|
* // Prepare image for the model
|
|
13549
13589
|
* const inputs = await processor(image);
|
|
@@ -13552,13 +13592,18 @@ class GLPNModel extends GLPNPreTrainedModel { }
|
|
|
13552
13592
|
* const { predicted_depth } = await model(inputs);
|
|
13553
13593
|
*
|
|
13554
13594
|
* // Interpolate to original size
|
|
13555
|
-
* const prediction =
|
|
13595
|
+
* const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), {
|
|
13596
|
+
* size: image.size.reverse(),
|
|
13597
|
+
* mode: 'bilinear',
|
|
13598
|
+
* })).squeeze(1);
|
|
13556
13599
|
*
|
|
13557
13600
|
* // Visualize the prediction
|
|
13558
|
-
* const
|
|
13601
|
+
* const min = prediction.min().item();
|
|
13602
|
+
* const max = prediction.max().item();
|
|
13603
|
+
* const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8');
|
|
13559
13604
|
* const depth = RawImage.fromTensor(formatted);
|
|
13560
13605
|
* // RawImage {
|
|
13561
|
-
* // data: Uint8Array(307200) [
|
|
13606
|
+
* // data: Uint8Array(307200) [ 85, 85, 84, ... ],
|
|
13562
13607
|
* // width: 640,
|
|
13563
13608
|
* // height: 480,
|
|
13564
13609
|
* // channels: 1
|
|
@@ -14525,10 +14570,12 @@ class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
|
|
|
14525
14570
|
|
|
14526
14571
|
const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs);
|
|
14527
14572
|
|
|
14573
|
+
// @ts-expect-error TS2339
|
|
14528
14574
|
const r = encoder_outputs.dims[1] / this.config.reduction_factor;
|
|
14529
14575
|
const maxlen = Math.floor(r * maxlenratio);
|
|
14530
14576
|
const minlen = Math.floor(r * minlenratio);
|
|
14531
14577
|
|
|
14578
|
+
// @ts-expect-error TS2339
|
|
14532
14579
|
const num_mel_bins = this.config.num_mel_bins;
|
|
14533
14580
|
|
|
14534
14581
|
let spectrogramParts = [];
|
|
@@ -14893,11 +14940,13 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
|
|
|
14893
14940
|
*/
|
|
14894
14941
|
_apply_and_filter_by_delay_pattern_mask(outputs) {
|
|
14895
14942
|
const [bs_x_codebooks, seqLength] = outputs.dims;
|
|
14943
|
+
// @ts-expect-error TS2339
|
|
14896
14944
|
const num_codebooks = this.config.decoder.num_codebooks;
|
|
14897
14945
|
const upperBound = (seqLength - num_codebooks);
|
|
14898
14946
|
|
|
14899
14947
|
let newDataSize = 0;
|
|
14900
14948
|
for (let i = 0; i < outputs.size; ++i) {
|
|
14949
|
+
// @ts-expect-error TS2339
|
|
14901
14950
|
if (outputs.data[i] === this.config.decoder.pad_token_id) {
|
|
14902
14951
|
continue;
|
|
14903
14952
|
}
|
|
@@ -14927,7 +14976,9 @@ class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE: not Mu
|
|
|
14927
14976
|
let clonedInputIds = structuredClone(input_ids);
|
|
14928
14977
|
for (let i = 0; i < clonedInputIds.length; ++i) {
|
|
14929
14978
|
for (let j = 0; j < clonedInputIds[i].length; ++j) {
|
|
14979
|
+
// @ts-expect-error TS2339
|
|
14930
14980
|
if ((i % this.config.decoder.num_codebooks) >= j) {
|
|
14981
|
+
// @ts-expect-error TS2339
|
|
14931
14982
|
clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id);
|
|
14932
14983
|
}
|
|
14933
14984
|
}
|
|
@@ -15084,6 +15135,9 @@ class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
|
|
|
15084
15135
|
'past_key_values',
|
|
15085
15136
|
];
|
|
15086
15137
|
|
|
15138
|
+
/**
|
|
15139
|
+
* @param {ConstructorParameters<typeof MultiModalityPreTrainedModel>} args
|
|
15140
|
+
*/
|
|
15087
15141
|
constructor(...args) {
|
|
15088
15142
|
super(...args);
|
|
15089
15143
|
|
|
@@ -16052,10 +16106,17 @@ class SequenceClassifierOutput extends ModelOutput {
|
|
|
16052
16106
|
/**
|
|
16053
16107
|
* @param {Object} output The output of the model.
|
|
16054
16108
|
* @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax).
|
|
16109
|
+
* @param {Record<string, Tensor>} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
|
|
16110
|
+
* Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
|
16055
16111
|
*/
|
|
16056
|
-
constructor({ logits }) {
|
|
16112
|
+
constructor({ logits, ...attentions }) {
|
|
16057
16113
|
super();
|
|
16058
16114
|
this.logits = logits;
|
|
16115
|
+
const attentions_list = Object.values(attentions);
|
|
16116
|
+
if (attentions_list.length > 0) {
|
|
16117
|
+
// Only set attentions if they are not empty
|
|
16118
|
+
this.attentions = attentions_list;
|
|
16119
|
+
}
|
|
16059
16120
|
}
|
|
16060
16121
|
}
|
|
16061
16122
|
|
|
@@ -16313,22 +16374,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
16313
16374
|
|
|
16314
16375
|
class AutoFeatureExtractor {
|
|
16315
16376
|
|
|
16316
|
-
/**
|
|
16317
|
-
* Instantiate one of the feature extractor classes of the library from a pretrained model.
|
|
16318
|
-
*
|
|
16319
|
-
* The processor class to instantiate is selected based on the `feature_extractor_type` property of
|
|
16320
|
-
* the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
16321
|
-
*
|
|
16322
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
16323
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
16324
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
16325
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
16326
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
16327
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
16328
|
-
*
|
|
16329
|
-
* @returns {Promise<AllFeatureExtractors.ImageProcessor>} A new instance of the Processor class.
|
|
16330
|
-
*/
|
|
16331
|
-
|
|
16332
16377
|
/** @type {typeof FeatureExtractor.from_pretrained} */
|
|
16333
16378
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
16334
16379
|
|
|
@@ -16457,22 +16502,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
16457
16502
|
*/
|
|
16458
16503
|
class AutoProcessor {
|
|
16459
16504
|
|
|
16460
|
-
/**
|
|
16461
|
-
* Instantiate one of the processor classes of the library from a pretrained model.
|
|
16462
|
-
*
|
|
16463
|
-
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
|
|
16464
|
-
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
|
|
16465
|
-
*
|
|
16466
|
-
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
|
|
16467
|
-
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
|
|
16468
|
-
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
|
|
16469
|
-
* user or organization name, like `dbmdz/bert-base-german-cased`.
|
|
16470
|
-
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
|
|
16471
|
-
* @param {import('../../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
|
|
16472
|
-
*
|
|
16473
|
-
* @returns {Promise<Processor>} A new instance of the Processor class.
|
|
16474
|
-
*/
|
|
16475
|
-
|
|
16476
16505
|
/** @type {typeof Processor.from_pretrained} */
|
|
16477
16506
|
static async from_pretrained(pretrained_model_name_or_path, options={}) {
|
|
16478
16507
|
|
|
@@ -16796,6 +16825,7 @@ class ConvNextImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
16796
16825
|
/**
|
|
16797
16826
|
* Percentage of the image to crop. Only has an effect if this.size < 384.
|
|
16798
16827
|
*/
|
|
16828
|
+
// @ts-expect-error TS2339
|
|
16799
16829
|
this.crop_pct = this.config.crop_pct ?? (224 / 256);
|
|
16800
16830
|
}
|
|
16801
16831
|
|
|
@@ -17003,6 +17033,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
17003
17033
|
class EfficientNetImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
|
|
17004
17034
|
constructor(config) {
|
|
17005
17035
|
super(config);
|
|
17036
|
+
// @ts-expect-error TS2339
|
|
17006
17037
|
this.include_top = this.config.include_top ?? true;
|
|
17007
17038
|
if (this.include_top) {
|
|
17008
17039
|
this.image_std = this.image_std.map(x => x * x);
|
|
@@ -17086,8 +17117,11 @@ class Florence2Processor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
17086
17117
|
super(config, components);
|
|
17087
17118
|
|
|
17088
17119
|
const {
|
|
17120
|
+
// @ts-expect-error TS2339
|
|
17089
17121
|
tasks_answer_post_processing_type,
|
|
17122
|
+
// @ts-expect-error TS2339
|
|
17090
17123
|
task_prompts_without_inputs,
|
|
17124
|
+
// @ts-expect-error TS2339
|
|
17091
17125
|
task_prompts_with_input,
|
|
17092
17126
|
} = this.image_processor.config;
|
|
17093
17127
|
|
|
@@ -17384,6 +17418,8 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
17384
17418
|
|
|
17385
17419
|
const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
|
|
17386
17420
|
const end_offset = (i + 1) * pixel_attention_mask_stride;
|
|
17421
|
+
|
|
17422
|
+
// @ts-expect-error
|
|
17387
17423
|
pixel_attention_mask_data.fill(false, start_offset, end_offset);
|
|
17388
17424
|
}
|
|
17389
17425
|
}
|
|
@@ -17793,6 +17829,7 @@ class VLMImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTE
|
|
|
17793
17829
|
},
|
|
17794
17830
|
...config,
|
|
17795
17831
|
});
|
|
17832
|
+
// @ts-expect-error TS2339
|
|
17796
17833
|
this.constant_values = this.config.background_color.map(x => x * this.rescale_factor)
|
|
17797
17834
|
}
|
|
17798
17835
|
|
|
@@ -18241,6 +18278,8 @@ class MgpstrProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE
|
|
|
18241
18278
|
* - bpe_preds: The list of BPE decoded sentences.
|
|
18242
18279
|
* - wp_preds: The list of wp decoded sentences.
|
|
18243
18280
|
*/
|
|
18281
|
+
// @ts-expect-error The type of this method is not compatible with the one
|
|
18282
|
+
// in the base class. It might be a good idea to fix this.
|
|
18244
18283
|
batch_decode([char_logits, bpe_logits, wp_logits]) {
|
|
18245
18284
|
const [char_preds, char_scores] = this._decode_helper(char_logits, 'char');
|
|
18246
18285
|
const [bpe_preds, bpe_scores] = this._decode_helper(bpe_logits, 'bpe');
|
|
@@ -18634,6 +18673,7 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
18634
18673
|
}
|
|
18635
18674
|
|
|
18636
18675
|
const bos_token = this.tokenizer.bos_token;
|
|
18676
|
+
// @ts-expect-error TS2339
|
|
18637
18677
|
const image_seq_length = this.image_processor.config.image_seq_length;
|
|
18638
18678
|
let input_strings;
|
|
18639
18679
|
if (text.some((t) => t.includes(IMAGE_TOKEN))) {
|
|
@@ -18886,7 +18926,7 @@ class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_
|
|
|
18886
18926
|
*
|
|
18887
18927
|
* @param {string|string[]} text
|
|
18888
18928
|
* @param {RawImage|RawImage[]} images
|
|
18889
|
-
* @param {
|
|
18929
|
+
* @param { { padding?: boolean, truncation?: boolean, num_crops?: number } | undefined } options
|
|
18890
18930
|
* @returns {Promise<any>}
|
|
18891
18931
|
*/
|
|
18892
18932
|
async _call(text, images = null, {
|
|
@@ -19073,6 +19113,7 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
|
|
|
19073
19113
|
|
|
19074
19114
|
let current_speaker = -1;
|
|
19075
19115
|
for (let i = 0; i < scores.length; ++i) {
|
|
19116
|
+
/** @type {number[]} */
|
|
19076
19117
|
const probabilities = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.softmax)(scores[i]);
|
|
19077
19118
|
const [score, id] = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_2__.max)(probabilities);
|
|
19078
19119
|
const [start, end] = [i, i + 1];
|
|
@@ -19260,6 +19301,7 @@ class Qwen2VLProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODUL
|
|
|
19260
19301
|
}
|
|
19261
19302
|
|
|
19262
19303
|
if (image_grid_thw) {
|
|
19304
|
+
// @ts-expect-error TS2551
|
|
19263
19305
|
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
19264
19306
|
let index = 0;
|
|
19265
19307
|
|
|
@@ -19751,8 +19793,8 @@ class SeamlessM4TFeatureExtractor extends _base_feature_extraction_utils_js__WEB
|
|
|
19751
19793
|
'int64',
|
|
19752
19794
|
new BigInt64Array(numPaddedFrames),
|
|
19753
19795
|
[1, numPaddedFrames],
|
|
19754
|
-
)
|
|
19755
|
-
padded_attention_mask.data.fill(1n, 0, num_frames);
|
|
19796
|
+
);
|
|
19797
|
+
/** @type {BigInt64Array} */ (padded_attention_mask.data).fill(1n, 0, num_frames);
|
|
19756
19798
|
}
|
|
19757
19799
|
}
|
|
19758
19800
|
}
|
|
@@ -20565,7 +20607,7 @@ class WhisperFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK
|
|
|
20565
20607
|
)
|
|
20566
20608
|
|
|
20567
20609
|
const data = features.data;
|
|
20568
|
-
const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(data)[0];
|
|
20610
|
+
const maxValue = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.max)(/** @type {Float32Array} */(data))[0];
|
|
20569
20611
|
|
|
20570
20612
|
for (let i = 0; i < data.length; ++i) {
|
|
20571
20613
|
data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
|
|
@@ -20828,6 +20870,16 @@ class TensorOpRegistry {
|
|
|
20828
20870
|
// executionProviders: ['webgpu'],
|
|
20829
20871
|
};
|
|
20830
20872
|
|
|
20873
|
+
static get nearest_interpolate_4d() {
|
|
20874
|
+
if (!this._nearest_interpolate_4d) {
|
|
20875
|
+
this._nearest_interpolate_4d = wrap(
|
|
20876
|
+
[8, 10, 18, 0, 58, 129, 1, 10, 41, 10, 1, 120, 10, 0, 10, 0, 10, 1, 115, 18, 1, 121, 34, 6, 82, 101, 115, 105, 122, 101, 42, 18, 10, 4, 109, 111, 100, 101, 34, 7, 110, 101, 97, 114, 101, 115, 116, 160, 1, 3, 18, 1, 114, 90, 31, 10, 1, 120, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 90, 15, 10, 1, 115, 18, 10, 10, 8, 8, 7, 18, 4, 10, 2, 8, 4, 98, 31, 10, 1, 121, 18, 26, 10, 24, 8, 1, 18, 20, 10, 3, 18, 1, 98, 10, 3, 18, 1, 99, 10, 3, 18, 1, 104, 10, 3, 18, 1, 119, 66, 2, 16, 21],
|
|
20877
|
+
this.session_options,
|
|
20878
|
+
'y',
|
|
20879
|
+
);
|
|
20880
|
+
}
|
|
20881
|
+
return this._nearest_interpolate_4d;
|
|
20882
|
+
}
|
|
20831
20883
|
static get bilinear_interpolate_4d() {
|
|
20832
20884
|
if (!this._bilinear_interpolate_4d) {
|
|
20833
20885
|
this._bilinear_interpolate_4d = wrap(
|
|
@@ -21202,6 +21254,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
|
|
|
21202
21254
|
|
|
21203
21255
|
// TODO: Use softmax tensor function
|
|
21204
21256
|
const function_to_apply =
|
|
21257
|
+
// @ts-expect-error TS2339
|
|
21205
21258
|
this.model.config.problem_type === 'multi_label_classification'
|
|
21206
21259
|
? batch => batch.sigmoid()
|
|
21207
21260
|
: batch => new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor(
|
|
@@ -21210,6 +21263,7 @@ class TextClassificationPipeline extends (/** @type {new (options: TextPipelineC
|
|
|
21210
21263
|
batch.dims,
|
|
21211
21264
|
); // single_label_classification (default)
|
|
21212
21265
|
|
|
21266
|
+
// @ts-expect-error TS2339
|
|
21213
21267
|
const id2label = this.model.config.id2label;
|
|
21214
21268
|
|
|
21215
21269
|
const toReturn = [];
|
|
@@ -21312,6 +21366,7 @@ class TokenClassificationPipeline extends (/** @type {new (options: TextPipeline
|
|
|
21312
21366
|
const outputs = await this.model(model_inputs)
|
|
21313
21367
|
|
|
21314
21368
|
const logits = outputs.logits;
|
|
21369
|
+
// @ts-expect-error TS2339
|
|
21315
21370
|
const id2label = this.model.config.id2label;
|
|
21316
21371
|
|
|
21317
21372
|
const toReturn = [];
|
|
@@ -21651,11 +21706,14 @@ class Text2TextGenerationPipeline extends (/** @type {new (options: TextPipeline
|
|
|
21651
21706
|
|
|
21652
21707
|
|
|
21653
21708
|
// Add global prefix, if present
|
|
21709
|
+
// @ts-expect-error TS2339
|
|
21654
21710
|
if (this.model.config.prefix) {
|
|
21711
|
+
// @ts-expect-error TS2339
|
|
21655
21712
|
texts = texts.map(x => this.model.config.prefix + x)
|
|
21656
21713
|
}
|
|
21657
21714
|
|
|
21658
21715
|
// Handle task specific params:
|
|
21716
|
+
// @ts-expect-error TS2339
|
|
21659
21717
|
const task_specific_params = this.model.config.task_specific_params
|
|
21660
21718
|
if (task_specific_params && task_specific_params[this.task]) {
|
|
21661
21719
|
// Add prefixes, if present
|
|
@@ -22394,6 +22452,7 @@ class AudioClassificationPipeline extends (/** @type {new (options: AudioPipelin
|
|
|
22394
22452
|
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
22395
22453
|
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
22396
22454
|
|
|
22455
|
+
// @ts-expect-error TS2339
|
|
22397
22456
|
const id2label = this.model.config.id2label;
|
|
22398
22457
|
|
|
22399
22458
|
const toReturn = [];
|
|
@@ -22704,6 +22763,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22704
22763
|
audio = [/** @type {AudioInput} */ (audio)];
|
|
22705
22764
|
}
|
|
22706
22765
|
|
|
22766
|
+
// @ts-expect-error TS2339
|
|
22707
22767
|
const time_precision = this.processor.feature_extractor.config.chunk_length / this.model.config.max_source_positions;
|
|
22708
22768
|
const hop_length = this.processor.feature_extractor.config.hop_length;
|
|
22709
22769
|
|
|
@@ -22769,7 +22829,9 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22769
22829
|
|
|
22770
22830
|
// TODO: Right now we only get top beam
|
|
22771
22831
|
if (return_timestamps === 'word') {
|
|
22832
|
+
// @ts-expect-error TS2339
|
|
22772
22833
|
chunk.tokens = data.sequences.tolist()[0];
|
|
22834
|
+
// @ts-expect-error TS2339
|
|
22773
22835
|
chunk.token_timestamps = data.token_timestamps.tolist()[0].map(
|
|
22774
22836
|
(/** @type {number} */ x) => (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_6__.round)(x, 2)
|
|
22775
22837
|
);
|
|
@@ -22814,7 +22876,7 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22814
22876
|
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
22815
22877
|
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
22816
22878
|
|
|
22817
|
-
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
22879
|
+
const text = this.processor.batch_decode(/** @type {Tensor} */(outputs), { skip_special_tokens: true })[0];
|
|
22818
22880
|
toReturn.push({ text });
|
|
22819
22881
|
}
|
|
22820
22882
|
return single ? toReturn[0] : toReturn;
|
|
@@ -22963,6 +23025,7 @@ class ImageClassificationPipeline extends (/** @type {new (options: ImagePipelin
|
|
|
22963
23025
|
const { pixel_values } = await this.processor(preparedImages);
|
|
22964
23026
|
const output = await this.model({ pixel_values });
|
|
22965
23027
|
|
|
23028
|
+
// @ts-expect-error TS2339
|
|
22966
23029
|
const id2label = this.model.config.id2label;
|
|
22967
23030
|
|
|
22968
23031
|
/** @type {ImageClassificationOutput[]} */
|
|
@@ -23077,6 +23140,7 @@ class ImageSegmentationPipeline extends (/** @type {new (options: ImagePipelineC
|
|
|
23077
23140
|
}
|
|
23078
23141
|
}
|
|
23079
23142
|
|
|
23143
|
+
// @ts-expect-error TS2339
|
|
23080
23144
|
const id2label = this.model.config.id2label;
|
|
23081
23145
|
|
|
23082
23146
|
/** @type {ImageSegmentationPipelineOutput[]} */
|
|
@@ -23303,6 +23367,7 @@ class ObjectDetectionPipeline extends (/** @type {new (options: ImagePipelineCon
|
|
|
23303
23367
|
const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSizes);
|
|
23304
23368
|
|
|
23305
23369
|
// Add labels
|
|
23370
|
+
// @ts-expect-error TS2339
|
|
23306
23371
|
const id2label = this.model.config.id2label;
|
|
23307
23372
|
|
|
23308
23373
|
// Format output
|
|
@@ -23522,6 +23587,7 @@ class DocumentQuestionAnsweringPipeline extends (/** @type {new (options: TextIm
|
|
|
23522
23587
|
// Run model
|
|
23523
23588
|
const output = await this.model.generate({
|
|
23524
23589
|
inputs: pixel_values,
|
|
23590
|
+
// @ts-expect-error TS2339
|
|
23525
23591
|
max_length: this.model.config.decoder.max_position_embeddings,
|
|
23526
23592
|
decoder_input_ids,
|
|
23527
23593
|
...generate_kwargs,
|
|
@@ -23637,6 +23703,7 @@ class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPipelineC
|
|
|
23637
23703
|
// Generate waveform
|
|
23638
23704
|
const { waveform } = await this.model(inputs);
|
|
23639
23705
|
|
|
23706
|
+
// @ts-expect-error TS2339
|
|
23640
23707
|
const sampling_rate = this.model.config.sampling_rate;
|
|
23641
23708
|
return {
|
|
23642
23709
|
audio: waveform.data,
|
|
@@ -23794,11 +23861,23 @@ class DepthEstimationPipeline extends (/** @type {new (options: ImagePipelineCon
|
|
|
23794
23861
|
|
|
23795
23862
|
const toReturn = [];
|
|
23796
23863
|
for (let i = 0; i < preparedImages.length; ++i) {
|
|
23797
|
-
const
|
|
23798
|
-
const
|
|
23864
|
+
const batch = predicted_depth[i];
|
|
23865
|
+
const [height, width] = batch.dims.slice(-2);
|
|
23866
|
+
const [new_width, new_height] = preparedImages[i].size;
|
|
23867
|
+
|
|
23868
|
+
// Interpolate to original size
|
|
23869
|
+
const prediction = (await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.interpolate_4d)(batch.view(1, 1, height, width), {
|
|
23870
|
+
size: [new_height, new_width],
|
|
23871
|
+
mode: 'bilinear',
|
|
23872
|
+
})).view(new_height, new_width);
|
|
23873
|
+
|
|
23874
|
+
const minval = /** @type {number} */(prediction.min().item());
|
|
23875
|
+
const maxval = /** @type {number} */(prediction.max().item());
|
|
23876
|
+
const formatted = prediction.sub(minval).div_(maxval - minval).mul_(255).to('uint8').unsqueeze(0);
|
|
23877
|
+
const depth = _utils_image_js__WEBPACK_IMPORTED_MODULE_9__.RawImage.fromTensor(formatted);
|
|
23799
23878
|
toReturn.push({
|
|
23800
|
-
predicted_depth:
|
|
23801
|
-
depth
|
|
23879
|
+
predicted_depth: prediction,
|
|
23880
|
+
depth,
|
|
23802
23881
|
});
|
|
23803
23882
|
}
|
|
23804
23883
|
|
|
@@ -24278,6 +24357,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
24278
24357
|
return result;
|
|
24279
24358
|
}
|
|
24280
24359
|
|
|
24360
|
+
|
|
24281
24361
|
/***/ }),
|
|
24282
24362
|
|
|
24283
24363
|
/***/ "./src/tokenizers.js":
|
|
@@ -24347,7 +24427,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
24347
24427
|
/* harmony import */ var _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./utils/data-structures.js */ "./src/utils/data-structures.js");
|
|
24348
24428
|
/* harmony import */ var _huggingface_jinja__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! @huggingface/jinja */ "./node_modules/@huggingface/jinja/dist/index.js");
|
|
24349
24429
|
/* harmony import */ var _models_whisper_common_whisper_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./models/whisper/common_whisper.js */ "./src/models/whisper/common_whisper.js");
|
|
24350
|
-
/* harmony import */ var _utils_constants_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./utils/constants.js */ "./src/utils/constants.js");
|
|
24351
24430
|
|
|
24352
24431
|
/**
|
|
24353
24432
|
* @file Tokenizers are used to prepare textual inputs for a model.
|
|
@@ -24384,7 +24463,6 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
24384
24463
|
|
|
24385
24464
|
|
|
24386
24465
|
|
|
24387
|
-
|
|
24388
24466
|
/**
|
|
24389
24467
|
* @typedef {Object} TokenizerProperties Additional tokenizer-specific properties.
|
|
24390
24468
|
* @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used.
|
|
@@ -24868,7 +24946,7 @@ class Unigram extends TokenizerModel {
|
|
|
24868
24946
|
* Create a new Unigram tokenizer model.
|
|
24869
24947
|
* @param {Object} config The configuration object for the Unigram model.
|
|
24870
24948
|
* @param {number} config.unk_id The ID of the unknown token
|
|
24871
|
-
* @param {
|
|
24949
|
+
* @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores.
|
|
24872
24950
|
* @param {Object} moreConfig Additional configuration object for the Unigram model.
|
|
24873
24951
|
*/
|
|
24874
24952
|
constructor(config, moreConfig) {
|
|
@@ -24876,11 +24954,10 @@ class Unigram extends TokenizerModel {
|
|
|
24876
24954
|
|
|
24877
24955
|
const vocabSize = config.vocab.length;
|
|
24878
24956
|
this.vocab = new Array(vocabSize);
|
|
24957
|
+
/** @type {number[]} */
|
|
24879
24958
|
this.scores = new Array(vocabSize);
|
|
24880
24959
|
for (let i = 0; i < vocabSize; ++i) {
|
|
24881
|
-
|
|
24882
|
-
this.vocab[i] = piece[0];
|
|
24883
|
-
this.scores[i] = piece[1];
|
|
24960
|
+
[this.vocab[i], this.scores[i]] = config.vocab[i];
|
|
24884
24961
|
}
|
|
24885
24962
|
|
|
24886
24963
|
this.unk_token_id = config.unk_id;
|
|
@@ -30243,6 +30320,8 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
30243
30320
|
/* harmony export */ });
|
|
30244
30321
|
/* harmony import */ var _env_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../env.js */ "./src/env.js");
|
|
30245
30322
|
/* harmony import */ var _devices_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./devices.js */ "./src/utils/devices.js");
|
|
30323
|
+
/// <reference types="@webgpu/types" />
|
|
30324
|
+
|
|
30246
30325
|
|
|
30247
30326
|
|
|
30248
30327
|
|
|
@@ -30498,7 +30577,7 @@ class FileResponse {
|
|
|
30498
30577
|
*/
|
|
30499
30578
|
async arrayBuffer() {
|
|
30500
30579
|
const data = await fs__WEBPACK_IMPORTED_MODULE_0__.promises.readFile(this.filePath);
|
|
30501
|
-
return data.buffer;
|
|
30580
|
+
return /** @type {ArrayBuffer} */ (data.buffer);
|
|
30502
30581
|
}
|
|
30503
30582
|
|
|
30504
30583
|
/**
|
|
@@ -32159,8 +32238,9 @@ function magnitude(arr) {
|
|
|
32159
32238
|
|
|
32160
32239
|
/**
|
|
32161
32240
|
* Returns the value and index of the minimum element in an array.
|
|
32162
|
-
* @
|
|
32163
|
-
* @
|
|
32241
|
+
* @template {number[]|bigint[]|AnyTypedArray} T
|
|
32242
|
+
* @param {T} arr array of numbers.
|
|
32243
|
+
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the minimum element, of the form: [valueOfMin, indexOfMin]
|
|
32164
32244
|
* @throws {Error} If array is empty.
|
|
32165
32245
|
*/
|
|
32166
32246
|
function min(arr) {
|
|
@@ -32173,14 +32253,15 @@ function min(arr) {
|
|
|
32173
32253
|
indexOfMin = i;
|
|
32174
32254
|
}
|
|
32175
32255
|
}
|
|
32176
|
-
return [min, indexOfMin];
|
|
32256
|
+
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([min, indexOfMin]);
|
|
32177
32257
|
}
|
|
32178
32258
|
|
|
32179
32259
|
|
|
32180
32260
|
/**
|
|
32181
32261
|
* Returns the value and index of the maximum element in an array.
|
|
32182
|
-
* @
|
|
32183
|
-
* @
|
|
32262
|
+
* @template {number[]|bigint[]|AnyTypedArray} T
|
|
32263
|
+
* @param {T} arr array of numbers.
|
|
32264
|
+
* @returns {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} the value and index of the maximum element, of the form: [valueOfMax, indexOfMax]
|
|
32184
32265
|
* @throws {Error} If array is empty.
|
|
32185
32266
|
*/
|
|
32186
32267
|
function max(arr) {
|
|
@@ -32193,7 +32274,7 @@ function max(arr) {
|
|
|
32193
32274
|
indexOfMax = i;
|
|
32194
32275
|
}
|
|
32195
32276
|
}
|
|
32196
|
-
return [Number(max), indexOfMax];
|
|
32277
|
+
return /** @type {T extends bigint[]|BigTypedArray ? [bigint, number] : [number, number]} */([max, indexOfMax]);
|
|
32197
32278
|
}
|
|
32198
32279
|
|
|
32199
32280
|
function isPowerOfTwo(number) {
|
|
@@ -33491,8 +33572,6 @@ class Tensor {
|
|
|
33491
33572
|
return this.permute(...dims);
|
|
33492
33573
|
}
|
|
33493
33574
|
|
|
33494
|
-
// TODO add .max() and .min() methods
|
|
33495
|
-
|
|
33496
33575
|
/**
|
|
33497
33576
|
* Returns the sum of each row of the input tensor in the given dimension dim.
|
|
33498
33577
|
*
|
|
@@ -33786,6 +33865,36 @@ class Tensor {
|
|
|
33786
33865
|
return mean(this, dim, keepdim);
|
|
33787
33866
|
}
|
|
33788
33867
|
|
|
33868
|
+
min(dim = null, keepdim = false) {
|
|
33869
|
+
if (dim !== null) {
|
|
33870
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33871
|
+
}
|
|
33872
|
+
const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[0];
|
|
33873
|
+
return new Tensor(this.type, [value], []);
|
|
33874
|
+
}
|
|
33875
|
+
max(dim = null, keepdim = false) {
|
|
33876
|
+
if (dim !== null) {
|
|
33877
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33878
|
+
}
|
|
33879
|
+
const value = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[0];
|
|
33880
|
+
return new Tensor(this.type, [value], []);
|
|
33881
|
+
}
|
|
33882
|
+
|
|
33883
|
+
argmin(dim = null, keepdim = false) {
|
|
33884
|
+
if (dim !== null) {
|
|
33885
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33886
|
+
}
|
|
33887
|
+
const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.min)(this.data)[1];
|
|
33888
|
+
return new Tensor('int64', [BigInt(index)], []);
|
|
33889
|
+
}
|
|
33890
|
+
argmax(dim = null, keepdim = false) {
|
|
33891
|
+
if (dim !== null) {
|
|
33892
|
+
throw new Error("`dim !== null` not yet implemented.");
|
|
33893
|
+
}
|
|
33894
|
+
const index = (0,_maths_js__WEBPACK_IMPORTED_MODULE_0__.max)(this.data)[1];
|
|
33895
|
+
return new Tensor('int64', [BigInt(index)], []);
|
|
33896
|
+
}
|
|
33897
|
+
|
|
33789
33898
|
/**
|
|
33790
33899
|
* Performs Tensor dtype conversion.
|
|
33791
33900
|
* @param {DataType} type The desired data type.
|
|
@@ -33919,7 +34028,7 @@ function interpolate(input, [out_height, out_width], mode = 'bilinear', align_co
|
|
|
33919
34028
|
* @param {Tensor} input the input tensor
|
|
33920
34029
|
* @param {Object} options the options for the interpolation
|
|
33921
34030
|
* @param {[number, number]|[number, number, number]|[number, number, number, number]} [options.size=null] output spatial size.
|
|
33922
|
-
* @param {"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
|
|
34031
|
+
* @param {"nearest"|"bilinear"|"bicubic"} [options.mode='bilinear'] algorithm used for upsampling
|
|
33923
34032
|
* @returns {Promise<Tensor>} The interpolated tensor.
|
|
33924
34033
|
*/
|
|
33925
34034
|
async function interpolate_4d(input, {
|
|
@@ -33949,7 +34058,9 @@ async function interpolate_4d(input, {
|
|
|
33949
34058
|
}
|
|
33950
34059
|
|
|
33951
34060
|
let op;
|
|
33952
|
-
if (mode === 'bilinear') {
|
|
34061
|
+
if (mode === 'nearest') {
|
|
34062
|
+
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.nearest_interpolate_4d;
|
|
34063
|
+
} else if (mode === 'bilinear') {
|
|
33953
34064
|
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bilinear_interpolate_4d;
|
|
33954
34065
|
} else if (mode === 'bicubic') {
|
|
33955
34066
|
op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.bicubic_interpolate_4d;
|
|
@@ -33990,13 +34101,13 @@ async function rfft(x, a) {
|
|
|
33990
34101
|
* Returns the k largest elements of the given input tensor.
|
|
33991
34102
|
* Inspired by https://pytorch.org/docs/stable/generated/torch.topk.html
|
|
33992
34103
|
* @param {Tensor} x the input tensor
|
|
33993
|
-
* @param {number} k the k in "top-k"
|
|
34104
|
+
* @param {number} [k] the k in "top-k"
|
|
33994
34105
|
* @returns {Promise<[Tensor, Tensor]>} the output tuple of (Tensor, LongTensor) of top-k elements and their indices.
|
|
33995
34106
|
*/
|
|
33996
34107
|
async function topk(x, k) {
|
|
33997
34108
|
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.top_k;
|
|
33998
34109
|
|
|
33999
|
-
if (k === null) {
|
|
34110
|
+
if (k == null) {
|
|
34000
34111
|
k = x.dims.at(-1);
|
|
34001
34112
|
} else {
|
|
34002
34113
|
k = Math.min(k, x.dims.at(-1));
|
|
@@ -34025,10 +34136,10 @@ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length])
|
|
|
34025
34136
|
async function slice(data, starts, ends, axes, steps) {
|
|
34026
34137
|
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
|
|
34027
34138
|
return await op({
|
|
34028
|
-
x: data,
|
|
34029
|
-
s: arrayToIndexTensor(starts),
|
|
34030
|
-
e: arrayToIndexTensor(ends),
|
|
34031
|
-
a: arrayToIndexTensor(axes),
|
|
34139
|
+
x: data,
|
|
34140
|
+
s: arrayToIndexTensor(starts),
|
|
34141
|
+
e: arrayToIndexTensor(ends),
|
|
34142
|
+
a: arrayToIndexTensor(axes),
|
|
34032
34143
|
t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
|
|
34033
34144
|
});
|
|
34034
34145
|
}
|