@huggingface/transformers 3.1.2 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/transformers.cjs +835 -144
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +850 -144
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +850 -144
- package/dist/transformers.mjs.map +1 -1
- package/package.json +1 -1
- package/src/base/image_processors_utils.js +3 -1
- package/src/configs.js +10 -2
- package/src/env.js +1 -1
- package/src/models/feature_extractors.js +1 -0
- package/src/models/idefics3/image_processing_idefics3.js +24 -13
- package/src/models/image_processors.js +1 -0
- package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
- package/src/models/moonshine/processing_moonshine.js +20 -0
- package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
- package/src/models/phi3_v/processing_phi3_v.js +53 -0
- package/src/models/processors.js +2 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
- package/src/models/pyannote/processing_pyannote.js +7 -54
- package/src/models.js +223 -30
- package/src/ops/registry.js +11 -0
- package/src/pipelines.js +31 -1
- package/src/utils/tensor.js +51 -1
- package/types/base/image_processors_utils.d.ts +2 -2
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/feature_extractors.d.ts +1 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
- package/types/models/moonshine/processing_moonshine.d.ts +17 -0
- package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +4 -15
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models.d.ts +64 -1
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +5 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +16 -0
- package/types/utils/tensor.d.ts.map +1 -1
package/dist/transformers.cjs
CHANGED
|
@@ -4920,7 +4920,7 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
4920
4920
|
* Pad the image by a certain amount.
|
|
4921
4921
|
* @param {Float32Array} pixelData The pixel data to pad.
|
|
4922
4922
|
* @param {number[]} imgDims The dimensions of the image (height, width, channels).
|
|
4923
|
-
* @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
|
|
4923
|
+
* @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
|
|
4924
4924
|
* @param {Object} options The options for padding.
|
|
4925
4925
|
* @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
|
|
4926
4926
|
* @param {boolean} [options.center=false] Whether to center the image.
|
|
@@ -4938,6 +4938,8 @@ class ImageProcessor extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_0__.Call
|
|
|
4938
4938
|
if (typeof padSize === 'number') {
|
|
4939
4939
|
paddedImageWidth = padSize;
|
|
4940
4940
|
paddedImageHeight = padSize;
|
|
4941
|
+
} else if (padSize === 'square') {
|
|
4942
|
+
paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
|
|
4941
4943
|
} else {
|
|
4942
4944
|
paddedImageWidth = padSize.width;
|
|
4943
4945
|
paddedImageHeight = padSize.height;
|
|
@@ -5583,8 +5585,6 @@ function getNormalizedConfig(config) {
|
|
|
5583
5585
|
case 'gpt_neox':
|
|
5584
5586
|
case 'stablelm':
|
|
5585
5587
|
case 'opt':
|
|
5586
|
-
case 'phi':
|
|
5587
|
-
case 'phi3':
|
|
5588
5588
|
case 'falcon':
|
|
5589
5589
|
mapping['num_heads'] = 'num_attention_heads';
|
|
5590
5590
|
mapping['num_layers'] = 'num_hidden_layers';
|
|
@@ -5600,6 +5600,9 @@ function getNormalizedConfig(config) {
|
|
|
5600
5600
|
case 'starcoder2':
|
|
5601
5601
|
case 'qwen2':
|
|
5602
5602
|
case 'qwen2_vl':
|
|
5603
|
+
case 'phi':
|
|
5604
|
+
case 'phi3':
|
|
5605
|
+
case 'phi3_v':
|
|
5603
5606
|
mapping['num_heads'] = 'num_key_value_heads';
|
|
5604
5607
|
mapping['num_layers'] = 'num_hidden_layers';
|
|
5605
5608
|
mapping['hidden_size'] = 'hidden_size';
|
|
@@ -5632,6 +5635,12 @@ function getNormalizedConfig(config) {
|
|
|
5632
5635
|
mapping['num_layers'] = 'n_layers';
|
|
5633
5636
|
mapping['hidden_size'] = 'd_model';
|
|
5634
5637
|
break;
|
|
5638
|
+
case 'exaone':
|
|
5639
|
+
mapping['num_heads'] = 'num_key_value_heads';
|
|
5640
|
+
mapping['num_layers'] = 'num_layers';
|
|
5641
|
+
mapping['dim_kv'] = 'head_dim';
|
|
5642
|
+
mapping['num_attention_heads'] = 'num_attention_heads';
|
|
5643
|
+
break;
|
|
5635
5644
|
|
|
5636
5645
|
// Encoder-decoder models
|
|
5637
5646
|
case 't5':
|
|
@@ -5673,6 +5682,7 @@ function getNormalizedConfig(config) {
|
|
|
5673
5682
|
mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
|
|
5674
5683
|
break;
|
|
5675
5684
|
case 'musicgen_decoder':
|
|
5685
|
+
case 'moonshine':
|
|
5676
5686
|
mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
|
|
5677
5687
|
mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
|
|
5678
5688
|
mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';
|
|
@@ -5922,7 +5932,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
5922
5932
|
|
|
5923
5933
|
|
|
5924
5934
|
|
|
5925
|
-
const VERSION = '3.1.2';
|
|
5935
|
+
const VERSION = '3.2.1';
|
|
5926
5936
|
|
|
5927
5937
|
// Check if various APIs are available (depends on environment)
|
|
5928
5938
|
const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
@@ -8020,6 +8030,9 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
8020
8030
|
/* harmony export */ EsmForTokenClassification: () => (/* binding */ EsmForTokenClassification),
|
|
8021
8031
|
/* harmony export */ EsmModel: () => (/* binding */ EsmModel),
|
|
8022
8032
|
/* harmony export */ EsmPreTrainedModel: () => (/* binding */ EsmPreTrainedModel),
|
|
8033
|
+
/* harmony export */ ExaoneForCausalLM: () => (/* binding */ ExaoneForCausalLM),
|
|
8034
|
+
/* harmony export */ ExaoneModel: () => (/* binding */ ExaoneModel),
|
|
8035
|
+
/* harmony export */ ExaonePreTrainedModel: () => (/* binding */ ExaonePreTrainedModel),
|
|
8023
8036
|
/* harmony export */ FalconForCausalLM: () => (/* binding */ FalconForCausalLM),
|
|
8024
8037
|
/* harmony export */ FalconModel: () => (/* binding */ FalconModel),
|
|
8025
8038
|
/* harmony export */ FalconPreTrainedModel: () => (/* binding */ FalconPreTrainedModel),
|
|
@@ -8143,7 +8156,15 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
8143
8156
|
/* harmony export */ MobileViTV2Model: () => (/* binding */ MobileViTV2Model),
|
|
8144
8157
|
/* harmony export */ MobileViTV2PreTrainedModel: () => (/* binding */ MobileViTV2PreTrainedModel),
|
|
8145
8158
|
/* harmony export */ ModelOutput: () => (/* binding */ ModelOutput),
|
|
8159
|
+
/* harmony export */ ModernBertForMaskedLM: () => (/* binding */ ModernBertForMaskedLM),
|
|
8160
|
+
/* harmony export */ ModernBertForSequenceClassification: () => (/* binding */ ModernBertForSequenceClassification),
|
|
8161
|
+
/* harmony export */ ModernBertForTokenClassification: () => (/* binding */ ModernBertForTokenClassification),
|
|
8162
|
+
/* harmony export */ ModernBertModel: () => (/* binding */ ModernBertModel),
|
|
8163
|
+
/* harmony export */ ModernBertPreTrainedModel: () => (/* binding */ ModernBertPreTrainedModel),
|
|
8146
8164
|
/* harmony export */ Moondream1ForConditionalGeneration: () => (/* binding */ Moondream1ForConditionalGeneration),
|
|
8165
|
+
/* harmony export */ MoonshineForConditionalGeneration: () => (/* binding */ MoonshineForConditionalGeneration),
|
|
8166
|
+
/* harmony export */ MoonshineModel: () => (/* binding */ MoonshineModel),
|
|
8167
|
+
/* harmony export */ MoonshinePreTrainedModel: () => (/* binding */ MoonshinePreTrainedModel),
|
|
8147
8168
|
/* harmony export */ MptForCausalLM: () => (/* binding */ MptForCausalLM),
|
|
8148
8169
|
/* harmony export */ MptModel: () => (/* binding */ MptModel),
|
|
8149
8170
|
/* harmony export */ MptPreTrainedModel: () => (/* binding */ MptPreTrainedModel),
|
|
@@ -8184,6 +8205,8 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
8184
8205
|
/* harmony export */ Phi3ForCausalLM: () => (/* binding */ Phi3ForCausalLM),
|
|
8185
8206
|
/* harmony export */ Phi3Model: () => (/* binding */ Phi3Model),
|
|
8186
8207
|
/* harmony export */ Phi3PreTrainedModel: () => (/* binding */ Phi3PreTrainedModel),
|
|
8208
|
+
/* harmony export */ Phi3VForCausalLM: () => (/* binding */ Phi3VForCausalLM),
|
|
8209
|
+
/* harmony export */ Phi3VPreTrainedModel: () => (/* binding */ Phi3VPreTrainedModel),
|
|
8187
8210
|
/* harmony export */ PhiForCausalLM: () => (/* binding */ PhiForCausalLM),
|
|
8188
8211
|
/* harmony export */ PhiModel: () => (/* binding */ PhiModel),
|
|
8189
8212
|
/* harmony export */ PhiPreTrainedModel: () => (/* binding */ PhiPreTrainedModel),
|
|
@@ -8429,6 +8452,7 @@ const MODEL_TYPES = {
|
|
|
8429
8452
|
ImageTextToText: 6,
|
|
8430
8453
|
Musicgen: 7,
|
|
8431
8454
|
MultiModality: 8,
|
|
8455
|
+
Phi3V: 9,
|
|
8432
8456
|
}
|
|
8433
8457
|
//////////////////////////////////////////////////
|
|
8434
8458
|
|
|
@@ -9204,6 +9228,10 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
9204
9228
|
this._forward = imageTextToTextForward;
|
|
9205
9229
|
this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
|
|
9206
9230
|
break;
|
|
9231
|
+
case MODEL_TYPES.Phi3V:
|
|
9232
|
+
this.can_generate = true;
|
|
9233
|
+
this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
|
|
9234
|
+
break;
|
|
9207
9235
|
|
|
9208
9236
|
case MODEL_TYPES.MultiModality:
|
|
9209
9237
|
this.can_generate = true;
|
|
@@ -9368,6 +9396,18 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
9368
9396
|
}, options),
|
|
9369
9397
|
]);
|
|
9370
9398
|
|
|
9399
|
+
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
9400
|
+
info = await Promise.all([
|
|
9401
|
+
constructSessions(pretrained_model_name_or_path, {
|
|
9402
|
+
prepare_inputs_embeds: 'prepare_inputs_embeds',
|
|
9403
|
+
model: 'model',
|
|
9404
|
+
vision_encoder: 'vision_encoder',
|
|
9405
|
+
}, options),
|
|
9406
|
+
getOptionalConfigs(pretrained_model_name_or_path, {
|
|
9407
|
+
generation_config: 'generation_config.json',
|
|
9408
|
+
}, options),
|
|
9409
|
+
]);
|
|
9410
|
+
|
|
9371
9411
|
} else { // should be MODEL_TYPES.EncoderOnly
|
|
9372
9412
|
if (modelType !== MODEL_TYPES.EncoderOnly) {
|
|
9373
9413
|
const type = modelName ?? config?.model_type;
|
|
@@ -10232,6 +10272,49 @@ class BertForQuestionAnswering extends BertPreTrainedModel {
|
|
|
10232
10272
|
}
|
|
10233
10273
|
//////////////////////////////////////////////////
|
|
10234
10274
|
|
|
10275
|
+
//////////////////////////////////////////////////
|
|
10276
|
+
// ModernBert models
|
|
10277
|
+
class ModernBertPreTrainedModel extends PreTrainedModel { }
|
|
10278
|
+
class ModernBertModel extends ModernBertPreTrainedModel { }
|
|
10279
|
+
|
|
10280
|
+
class ModernBertForMaskedLM extends ModernBertPreTrainedModel {
|
|
10281
|
+
/**
|
|
10282
|
+
* Calls the model on new inputs.
|
|
10283
|
+
*
|
|
10284
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
10285
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
10286
|
+
*/
|
|
10287
|
+
async _call(model_inputs) {
|
|
10288
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
10289
|
+
}
|
|
10290
|
+
}
|
|
10291
|
+
|
|
10292
|
+
class ModernBertForSequenceClassification extends ModernBertPreTrainedModel {
|
|
10293
|
+
/**
|
|
10294
|
+
* Calls the model on new inputs.
|
|
10295
|
+
*
|
|
10296
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
10297
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
10298
|
+
*/
|
|
10299
|
+
async _call(model_inputs) {
|
|
10300
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
10301
|
+
}
|
|
10302
|
+
}
|
|
10303
|
+
|
|
10304
|
+
class ModernBertForTokenClassification extends ModernBertPreTrainedModel {
|
|
10305
|
+
/**
|
|
10306
|
+
* Calls the model on new inputs.
|
|
10307
|
+
*
|
|
10308
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
10309
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
10310
|
+
*/
|
|
10311
|
+
async _call(model_inputs) {
|
|
10312
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
10313
|
+
}
|
|
10314
|
+
}
|
|
10315
|
+
//////////////////////////////////////////////////
|
|
10316
|
+
|
|
10317
|
+
|
|
10235
10318
|
//////////////////////////////////////////////////
|
|
10236
10319
|
// NomicBert models
|
|
10237
10320
|
class NomicBertPreTrainedModel extends PreTrainedModel { }
|
|
@@ -11640,6 +11723,29 @@ class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
11640
11723
|
}
|
|
11641
11724
|
//////////////////////////////////////////////////
|
|
11642
11725
|
|
|
11726
|
+
|
|
11727
|
+
//////////////////////////////////////////////////
|
|
11728
|
+
// Moonshine models
|
|
11729
|
+
class MoonshinePreTrainedModel extends PreTrainedModel {
|
|
11730
|
+
|
|
11731
|
+
requires_attention_mask = false;
|
|
11732
|
+
main_input_name = 'input_values';
|
|
11733
|
+
forward_params = [
|
|
11734
|
+
'input_values',
|
|
11735
|
+
'decoder_input_ids',
|
|
11736
|
+
'past_key_values',
|
|
11737
|
+
];
|
|
11738
|
+
};
|
|
11739
|
+
|
|
11740
|
+
/**
|
|
11741
|
+
* MoonshineModel class for training Moonshine models without a language model head.
|
|
11742
|
+
*/
|
|
11743
|
+
class MoonshineModel extends MoonshinePreTrainedModel { }
|
|
11744
|
+
|
|
11745
|
+
class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
|
|
11746
|
+
//////////////////////////////////////////////////
|
|
11747
|
+
|
|
11748
|
+
|
|
11643
11749
|
//////////////////////////////////////////////////
|
|
11644
11750
|
/**
|
|
11645
11751
|
* Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
|
|
@@ -11910,6 +12016,77 @@ class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
|
11910
12016
|
}
|
|
11911
12017
|
//////////////////////////////////////////////////
|
|
11912
12018
|
|
|
12019
|
+
class Phi3VPreTrainedModel extends PreTrainedModel {
|
|
12020
|
+
forward_params = [
|
|
12021
|
+
'input_ids',
|
|
12022
|
+
'inputs_embeds',
|
|
12023
|
+
'attention_mask',
|
|
12024
|
+
'position_ids',
|
|
12025
|
+
'pixel_values',
|
|
12026
|
+
'image_sizes',
|
|
12027
|
+
'past_key_values',
|
|
12028
|
+
];
|
|
12029
|
+
}
|
|
12030
|
+
class Phi3VForCausalLM extends Phi3VPreTrainedModel {
|
|
12031
|
+
|
|
12032
|
+
async forward({
|
|
12033
|
+
// Produced by the tokenizer/processor:
|
|
12034
|
+
input_ids = null,
|
|
12035
|
+
attention_mask = null,
|
|
12036
|
+
pixel_values = null,
|
|
12037
|
+
image_sizes = null,
|
|
12038
|
+
|
|
12039
|
+
// Used during generation:
|
|
12040
|
+
position_ids = null,
|
|
12041
|
+
inputs_embeds = null,
|
|
12042
|
+
past_key_values = null,
|
|
12043
|
+
|
|
12044
|
+
// Generic generation parameters
|
|
12045
|
+
generation_config = null,
|
|
12046
|
+
logits_processor = null,
|
|
12047
|
+
|
|
12048
|
+
// TODO: needed?
|
|
12049
|
+
...kwargs
|
|
12050
|
+
}) {
|
|
12051
|
+
if (!inputs_embeds) {
|
|
12052
|
+
let image_features;
|
|
12053
|
+
if (pixel_values && input_ids.dims[1] !== 1) {
|
|
12054
|
+
if (!image_sizes) {
|
|
12055
|
+
throw new Error('`image_sizes` must be provided when `pixel_values` is provided.');
|
|
12056
|
+
}
|
|
12057
|
+
|
|
12058
|
+
// Encode the image
|
|
12059
|
+
({ image_features } = await sessionRun(this.sessions['vision_encoder'], {
|
|
12060
|
+
pixel_values,
|
|
12061
|
+
image_sizes,
|
|
12062
|
+
}));
|
|
12063
|
+
} else {
|
|
12064
|
+
const hidden_size = this.config.normalized_config.hidden_size;
|
|
12065
|
+
image_features = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_9__.Tensor(
|
|
12066
|
+
'float32',
|
|
12067
|
+
[],
|
|
12068
|
+
[0, hidden_size],
|
|
12069
|
+
);
|
|
12070
|
+
}
|
|
12071
|
+
|
|
12072
|
+
({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], {
|
|
12073
|
+
input_ids,
|
|
12074
|
+
image_features,
|
|
12075
|
+
}));
|
|
12076
|
+
}
|
|
12077
|
+
|
|
12078
|
+
const outputs = await decoderForward(this, {
|
|
12079
|
+
inputs_embeds,
|
|
12080
|
+
past_key_values,
|
|
12081
|
+
attention_mask,
|
|
12082
|
+
position_ids,
|
|
12083
|
+
generation_config,
|
|
12084
|
+
logits_processor,
|
|
12085
|
+
}, false);
|
|
12086
|
+
return outputs;
|
|
12087
|
+
}
|
|
12088
|
+
}
|
|
12089
|
+
|
|
11913
12090
|
//////////////////////////////////////////////////
|
|
11914
12091
|
class CLIPPreTrainedModel extends PreTrainedModel { }
|
|
11915
12092
|
|
|
@@ -11964,9 +12141,11 @@ class CLIPModel extends CLIPPreTrainedModel { }
|
|
|
11964
12141
|
class CLIPTextModel extends CLIPPreTrainedModel {
|
|
11965
12142
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
11966
12143
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
11967
|
-
|
|
11968
|
-
|
|
11969
|
-
|
|
12144
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12145
|
+
// Update default model file name if not provided
|
|
12146
|
+
model_file_name: 'text_model',
|
|
12147
|
+
...options,
|
|
12148
|
+
});
|
|
11970
12149
|
}
|
|
11971
12150
|
}
|
|
11972
12151
|
|
|
@@ -11999,9 +12178,11 @@ class CLIPTextModel extends CLIPPreTrainedModel {
|
|
|
11999
12178
|
class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
12000
12179
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12001
12180
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12002
|
-
|
|
12003
|
-
|
|
12004
|
-
|
|
12181
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12182
|
+
// Update default model file name if not provided
|
|
12183
|
+
model_file_name: 'text_model',
|
|
12184
|
+
...options,
|
|
12185
|
+
});
|
|
12005
12186
|
}
|
|
12006
12187
|
}
|
|
12007
12188
|
|
|
@@ -12011,9 +12192,11 @@ class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
|
12011
12192
|
class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
12012
12193
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12013
12194
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12014
|
-
|
|
12015
|
-
|
|
12016
|
-
|
|
12195
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12196
|
+
// Update default model file name if not provided
|
|
12197
|
+
model_file_name: 'vision_model',
|
|
12198
|
+
...options,
|
|
12199
|
+
});
|
|
12017
12200
|
}
|
|
12018
12201
|
}
|
|
12019
12202
|
|
|
@@ -12046,9 +12229,11 @@ class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
|
12046
12229
|
class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
|
|
12047
12230
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12048
12231
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12049
|
-
|
|
12050
|
-
|
|
12051
|
-
|
|
12232
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12233
|
+
// Update default model file name if not provided
|
|
12234
|
+
model_file_name: 'vision_model',
|
|
12235
|
+
...options,
|
|
12236
|
+
});
|
|
12052
12237
|
}
|
|
12053
12238
|
}
|
|
12054
12239
|
//////////////////////////////////////////////////
|
|
@@ -12132,9 +12317,11 @@ class SiglipModel extends SiglipPreTrainedModel { }
|
|
|
12132
12317
|
class SiglipTextModel extends SiglipPreTrainedModel {
|
|
12133
12318
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12134
12319
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12135
|
-
|
|
12136
|
-
|
|
12137
|
-
|
|
12320
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12321
|
+
// Update default model file name if not provided
|
|
12322
|
+
model_file_name: 'text_model',
|
|
12323
|
+
...options,
|
|
12324
|
+
});
|
|
12138
12325
|
}
|
|
12139
12326
|
}
|
|
12140
12327
|
|
|
@@ -12167,9 +12354,11 @@ class SiglipTextModel extends SiglipPreTrainedModel {
|
|
|
12167
12354
|
class SiglipVisionModel extends CLIPPreTrainedModel {
|
|
12168
12355
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12169
12356
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12170
|
-
|
|
12171
|
-
|
|
12172
|
-
|
|
12357
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12358
|
+
// Update default model file name if not provided
|
|
12359
|
+
model_file_name: 'vision_model',
|
|
12360
|
+
...options,
|
|
12361
|
+
});
|
|
12173
12362
|
}
|
|
12174
12363
|
}
|
|
12175
12364
|
//////////////////////////////////////////////////
|
|
@@ -12224,18 +12413,22 @@ class JinaCLIPModel extends JinaCLIPPreTrainedModel {
|
|
|
12224
12413
|
class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
|
|
12225
12414
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12226
12415
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12227
|
-
|
|
12228
|
-
|
|
12229
|
-
|
|
12416
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12417
|
+
// Update default model file name if not provided
|
|
12418
|
+
model_file_name: 'text_model',
|
|
12419
|
+
...options,
|
|
12420
|
+
});
|
|
12230
12421
|
}
|
|
12231
12422
|
}
|
|
12232
12423
|
|
|
12233
12424
|
class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
|
|
12234
12425
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
12235
12426
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
12236
|
-
|
|
12237
|
-
|
|
12238
|
-
|
|
12427
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
12428
|
+
// Update default model file name if not provided
|
|
12429
|
+
model_file_name: 'vision_model',
|
|
12430
|
+
...options,
|
|
12431
|
+
});
|
|
12239
12432
|
}
|
|
12240
12433
|
}
|
|
12241
12434
|
//////////////////////////////////////////////////
|
|
@@ -12395,6 +12588,14 @@ class LlamaForCausalLM extends LlamaPreTrainedModel { }
|
|
|
12395
12588
|
//////////////////////////////////////////////////
|
|
12396
12589
|
|
|
12397
12590
|
|
|
12591
|
+
//////////////////////////////////////////////////
|
|
12592
|
+
// EXAONE models
|
|
12593
|
+
class ExaonePreTrainedModel extends PreTrainedModel { }
|
|
12594
|
+
class ExaoneModel extends ExaonePreTrainedModel { }
|
|
12595
|
+
class ExaoneForCausalLM extends ExaonePreTrainedModel { }
|
|
12596
|
+
//////////////////////////////////////////////////
|
|
12597
|
+
|
|
12598
|
+
|
|
12398
12599
|
//////////////////////////////////////////////////
|
|
12399
12600
|
// MobileLLM models
|
|
12400
12601
|
class MobileLLMPreTrainedModel extends PreTrainedModel { }
|
|
@@ -14457,9 +14658,11 @@ class ClapModel extends ClapPreTrainedModel { }
|
|
|
14457
14658
|
class ClapTextModelWithProjection extends ClapPreTrainedModel {
|
|
14458
14659
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
14459
14660
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
14460
|
-
|
|
14461
|
-
|
|
14462
|
-
|
|
14661
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
14662
|
+
// Update default model file name if not provided
|
|
14663
|
+
model_file_name: 'text_model',
|
|
14664
|
+
...options,
|
|
14665
|
+
});
|
|
14463
14666
|
}
|
|
14464
14667
|
}
|
|
14465
14668
|
|
|
@@ -14492,9 +14695,11 @@ class ClapTextModelWithProjection extends ClapPreTrainedModel {
|
|
|
14492
14695
|
class ClapAudioModelWithProjection extends ClapPreTrainedModel {
|
|
14493
14696
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
14494
14697
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
14495
|
-
|
|
14496
|
-
|
|
14497
|
-
|
|
14698
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
14699
|
+
// Update default model file name if not provided
|
|
14700
|
+
model_file_name: 'audio_model',
|
|
14701
|
+
...options,
|
|
14702
|
+
});
|
|
14498
14703
|
}
|
|
14499
14704
|
}
|
|
14500
14705
|
//////////////////////////////////////////////////
|
|
@@ -15080,6 +15285,7 @@ class PretrainedMixin {
|
|
|
15080
15285
|
|
|
15081
15286
|
const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
|
|
15082
15287
|
['bert', ['BertModel', BertModel]],
|
|
15288
|
+
['modernbert', ['ModernBertModel', ModernBertModel]],
|
|
15083
15289
|
['nomic_bert', ['NomicBertModel', NomicBertModel]],
|
|
15084
15290
|
['roformer', ['RoFormerModel', RoFormerModel]],
|
|
15085
15291
|
['electra', ['ElectraModel', ElectraModel]],
|
|
@@ -15181,6 +15387,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
15181
15387
|
['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]],
|
|
15182
15388
|
['codegen', ['CodeGenModel', CodeGenModel]],
|
|
15183
15389
|
['llama', ['LlamaModel', LlamaModel]],
|
|
15390
|
+
['exaone', ['ExaoneModel', ExaoneModel]],
|
|
15184
15391
|
['olmo', ['OlmoModel', OlmoModel]],
|
|
15185
15392
|
['olmo2', ['Olmo2Model', Olmo2Model]],
|
|
15186
15393
|
['mobilellm', ['MobileLLMModel', MobileLLMModel]],
|
|
@@ -15203,6 +15410,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
15203
15410
|
const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
|
|
15204
15411
|
['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
|
|
15205
15412
|
['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
|
|
15413
|
+
['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
|
|
15206
15414
|
]);
|
|
15207
15415
|
|
|
15208
15416
|
const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
|
|
@@ -15216,6 +15424,7 @@ const MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = new Map([
|
|
|
15216
15424
|
|
|
15217
15425
|
const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([
|
|
15218
15426
|
['bert', ['BertForSequenceClassification', BertForSequenceClassification]],
|
|
15427
|
+
['modernbert', ['ModernBertForSequenceClassification', ModernBertForSequenceClassification]],
|
|
15219
15428
|
['roformer', ['RoFormerForSequenceClassification', RoFormerForSequenceClassification]],
|
|
15220
15429
|
['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]],
|
|
15221
15430
|
['esm', ['EsmForSequenceClassification', EsmForSequenceClassification]],
|
|
@@ -15237,6 +15446,7 @@ const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([
|
|
|
15237
15446
|
|
|
15238
15447
|
const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([
|
|
15239
15448
|
['bert', ['BertForTokenClassification', BertForTokenClassification]],
|
|
15449
|
+
['modernbert', ['ModernBertForTokenClassification', ModernBertForTokenClassification]],
|
|
15240
15450
|
['roformer', ['RoFormerForTokenClassification', RoFormerForTokenClassification]],
|
|
15241
15451
|
['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]],
|
|
15242
15452
|
['esm', ['EsmForTokenClassification', EsmForTokenClassification]],
|
|
@@ -15273,6 +15483,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
15273
15483
|
['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
|
|
15274
15484
|
['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
|
|
15275
15485
|
['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
|
|
15486
|
+
['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]],
|
|
15276
15487
|
['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
|
|
15277
15488
|
['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
|
|
15278
15489
|
['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
|
|
@@ -15292,6 +15503,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
15292
15503
|
['falcon', ['FalconForCausalLM', FalconForCausalLM]],
|
|
15293
15504
|
['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]],
|
|
15294
15505
|
['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]],
|
|
15506
|
+
|
|
15507
|
+
// Also image-text-to-text
|
|
15508
|
+
['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]],
|
|
15295
15509
|
]);
|
|
15296
15510
|
|
|
15297
15511
|
const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
|
|
@@ -15301,6 +15515,7 @@ const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
|
|
|
15301
15515
|
|
|
15302
15516
|
const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([
|
|
15303
15517
|
['bert', ['BertForMaskedLM', BertForMaskedLM]],
|
|
15518
|
+
['modernbert', ['ModernBertForMaskedLM', ModernBertForMaskedLM]],
|
|
15304
15519
|
['roformer', ['RoFormerForMaskedLM', RoFormerForMaskedLM]],
|
|
15305
15520
|
['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]],
|
|
15306
15521
|
['esm', ['EsmForMaskedLM', EsmForMaskedLM]],
|
|
@@ -15529,6 +15744,7 @@ const CUSTOM_MAPPING = [
|
|
|
15529
15744
|
// OVERRIDE:
|
|
15530
15745
|
// TODO: Refactor to allow class to specify model
|
|
15531
15746
|
['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen],
|
|
15747
|
+
['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V],
|
|
15532
15748
|
|
|
15533
15749
|
['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
|
|
15534
15750
|
['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
|
|
@@ -16783,23 +16999,26 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
16783
16999
|
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
16784
17000
|
/* harmony export */ ASTFeatureExtractor: () => (/* reexport safe */ _audio_spectrogram_transformer_feature_extraction_audio_spectrogram_transformer_js__WEBPACK_IMPORTED_MODULE_0__.ASTFeatureExtractor),
|
|
16785
17001
|
/* harmony export */ ClapFeatureExtractor: () => (/* reexport safe */ _clap_feature_extraction_clap_js__WEBPACK_IMPORTED_MODULE_1__.ClapFeatureExtractor),
|
|
16786
|
-
/* harmony export */ ImageFeatureExtractor: () => (/* reexport safe */
|
|
16787
|
-
/* harmony export */
|
|
16788
|
-
/* harmony export */
|
|
16789
|
-
/* harmony export */
|
|
16790
|
-
/* harmony export */
|
|
16791
|
-
/* harmony export */
|
|
16792
|
-
/* harmony export */
|
|
17002
|
+
/* harmony export */ ImageFeatureExtractor: () => (/* reexport safe */ _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_9__.ImageProcessor),
|
|
17003
|
+
/* harmony export */ MoonshineFeatureExtractor: () => (/* reexport safe */ _moonshine_feature_extraction_moonshine_js__WEBPACK_IMPORTED_MODULE_2__.MoonshineFeatureExtractor),
|
|
17004
|
+
/* harmony export */ PyAnnoteFeatureExtractor: () => (/* reexport safe */ _pyannote_feature_extraction_pyannote_js__WEBPACK_IMPORTED_MODULE_3__.PyAnnoteFeatureExtractor),
|
|
17005
|
+
/* harmony export */ SeamlessM4TFeatureExtractor: () => (/* reexport safe */ _seamless_m4t_feature_extraction_seamless_m4t_js__WEBPACK_IMPORTED_MODULE_4__.SeamlessM4TFeatureExtractor),
|
|
17006
|
+
/* harmony export */ SpeechT5FeatureExtractor: () => (/* reexport safe */ _speecht5_feature_extraction_speecht5_js__WEBPACK_IMPORTED_MODULE_5__.SpeechT5FeatureExtractor),
|
|
17007
|
+
/* harmony export */ Wav2Vec2FeatureExtractor: () => (/* reexport safe */ _wav2vec2_feature_extraction_wav2vec2_js__WEBPACK_IMPORTED_MODULE_6__.Wav2Vec2FeatureExtractor),
|
|
17008
|
+
/* harmony export */ WeSpeakerFeatureExtractor: () => (/* reexport safe */ _wespeaker_feature_extraction_wespeaker_js__WEBPACK_IMPORTED_MODULE_7__.WeSpeakerFeatureExtractor),
|
|
17009
|
+
/* harmony export */ WhisperFeatureExtractor: () => (/* reexport safe */ _whisper_feature_extraction_whisper_js__WEBPACK_IMPORTED_MODULE_8__.WhisperFeatureExtractor)
|
|
16793
17010
|
/* harmony export */ });
|
|
16794
17011
|
/* harmony import */ var _audio_spectrogram_transformer_feature_extraction_audio_spectrogram_transformer_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js */ "./src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js");
|
|
16795
17012
|
/* harmony import */ var _clap_feature_extraction_clap_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./clap/feature_extraction_clap.js */ "./src/models/clap/feature_extraction_clap.js");
|
|
16796
|
-
/* harmony import */ var
|
|
16797
|
-
/* harmony import */ var
|
|
16798
|
-
/* harmony import */ var
|
|
16799
|
-
/* harmony import */ var
|
|
16800
|
-
/* harmony import */ var
|
|
16801
|
-
/* harmony import */ var
|
|
16802
|
-
/* harmony import */ var
|
|
17013
|
+
/* harmony import */ var _moonshine_feature_extraction_moonshine_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./moonshine/feature_extraction_moonshine.js */ "./src/models/moonshine/feature_extraction_moonshine.js");
|
|
17014
|
+
/* harmony import */ var _pyannote_feature_extraction_pyannote_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./pyannote/feature_extraction_pyannote.js */ "./src/models/pyannote/feature_extraction_pyannote.js");
|
|
17015
|
+
/* harmony import */ var _seamless_m4t_feature_extraction_seamless_m4t_js__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! ./seamless_m4t/feature_extraction_seamless_m4t.js */ "./src/models/seamless_m4t/feature_extraction_seamless_m4t.js");
|
|
17016
|
+
/* harmony import */ var _speecht5_feature_extraction_speecht5_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./speecht5/feature_extraction_speecht5.js */ "./src/models/speecht5/feature_extraction_speecht5.js");
|
|
17017
|
+
/* harmony import */ var _wav2vec2_feature_extraction_wav2vec2_js__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! ./wav2vec2/feature_extraction_wav2vec2.js */ "./src/models/wav2vec2/feature_extraction_wav2vec2.js");
|
|
17018
|
+
/* harmony import */ var _wespeaker_feature_extraction_wespeaker_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./wespeaker/feature_extraction_wespeaker.js */ "./src/models/wespeaker/feature_extraction_wespeaker.js");
|
|
17019
|
+
/* harmony import */ var _whisper_feature_extraction_whisper_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./whisper/feature_extraction_whisper.js */ "./src/models/whisper/feature_extraction_whisper.js");
|
|
17020
|
+
/* harmony import */ var _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ../base/image_processors_utils.js */ "./src/base/image_processors_utils.js");
|
|
17021
|
+
|
|
16803
17022
|
|
|
16804
17023
|
|
|
16805
17024
|
|
|
@@ -17180,18 +17399,29 @@ class Idefics3ImageProcessor extends _base_image_processors_utils_js__WEBPACK_IM
|
|
|
17180
17399
|
const optimal_width = Math.ceil(width / num_splits_w);
|
|
17181
17400
|
|
|
17182
17401
|
// Iterate through each row and column
|
|
17183
|
-
for (let r = 0; r < num_splits_h; r
|
|
17184
|
-
for (let c = 0; c < num_splits_w; c
|
|
17185
|
-
|
|
17186
|
-
|
|
17187
|
-
|
|
17188
|
-
|
|
17189
|
-
|
|
17190
|
-
|
|
17191
|
-
|
|
17192
|
-
|
|
17193
|
-
//
|
|
17194
|
-
|
|
17402
|
+
for (let r = 0; r < num_splits_h; ++r) {
|
|
17403
|
+
for (let c = 0; c < num_splits_w; ++c) {
|
|
17404
|
+
let start_x, start_y, end_x, end_y;
|
|
17405
|
+
if (r === num_splits_h - 1) { // At bottom
|
|
17406
|
+
start_y = height - optimal_height;
|
|
17407
|
+
end_y = height;
|
|
17408
|
+
} else {
|
|
17409
|
+
start_y = r * optimal_height;
|
|
17410
|
+
end_y = (r + 1) * optimal_height;
|
|
17411
|
+
}
|
|
17412
|
+
if (c === num_splits_w - 1) { // At right
|
|
17413
|
+
start_x = width - optimal_width;
|
|
17414
|
+
end_x = width;
|
|
17415
|
+
} else {
|
|
17416
|
+
start_x = c * optimal_width;
|
|
17417
|
+
end_x = (c + 1) * optimal_width;
|
|
17418
|
+
}
|
|
17419
|
+
|
|
17420
|
+
const starts = [start_y, start_x];
|
|
17421
|
+
const ends = [end_y, end_x];
|
|
17422
|
+
|
|
17423
|
+
const patch = await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.slice)(pixel_values, starts, ends, [2, 3]);
|
|
17424
|
+
frames.push(patch);
|
|
17195
17425
|
}
|
|
17196
17426
|
}
|
|
17197
17427
|
|
|
@@ -17417,21 +17647,22 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
17417
17647
|
/* harmony export */ OwlViTFeatureExtractor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__.OwlViTFeatureExtractor),
|
|
17418
17648
|
/* harmony export */ OwlViTImageProcessor: () => (/* reexport safe */ _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__.OwlViTImageProcessor),
|
|
17419
17649
|
/* harmony export */ Owlv2ImageProcessor: () => (/* reexport safe */ _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_23__.Owlv2ImageProcessor),
|
|
17420
|
-
/* harmony export */
|
|
17421
|
-
/* harmony export */
|
|
17422
|
-
/* harmony export */
|
|
17423
|
-
/* harmony export */
|
|
17424
|
-
/* harmony export */
|
|
17425
|
-
/* harmony export */
|
|
17426
|
-
/* harmony export */
|
|
17427
|
-
/* harmony export */
|
|
17650
|
+
/* harmony export */ Phi3VImageProcessor: () => (/* reexport safe */ _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_25__.Phi3VImageProcessor),
|
|
17651
|
+
/* harmony export */ PvtImageProcessor: () => (/* reexport safe */ _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_26__.PvtImageProcessor),
|
|
17652
|
+
/* harmony export */ Qwen2VLImageProcessor: () => (/* reexport safe */ _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_27__.Qwen2VLImageProcessor),
|
|
17653
|
+
/* harmony export */ RTDetrImageProcessor: () => (/* reexport safe */ _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_28__.RTDetrImageProcessor),
|
|
17654
|
+
/* harmony export */ SamImageProcessor: () => (/* reexport safe */ _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_29__.SamImageProcessor),
|
|
17655
|
+
/* harmony export */ SegformerFeatureExtractor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__.SegformerFeatureExtractor),
|
|
17656
|
+
/* harmony export */ SegformerImageProcessor: () => (/* reexport safe */ _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__.SegformerImageProcessor),
|
|
17657
|
+
/* harmony export */ SiglipImageProcessor: () => (/* reexport safe */ _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_31__.SiglipImageProcessor),
|
|
17658
|
+
/* harmony export */ Swin2SRImageProcessor: () => (/* reexport safe */ _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_32__.Swin2SRImageProcessor),
|
|
17428
17659
|
/* harmony export */ VLMImageProcessor: () => (/* reexport safe */ _janus_image_processing_janus_js__WEBPACK_IMPORTED_MODULE_12__.VLMImageProcessor),
|
|
17429
|
-
/* harmony export */ ViTFeatureExtractor: () => (/* reexport safe */
|
|
17430
|
-
/* harmony export */ ViTImageProcessor: () => (/* reexport safe */
|
|
17431
|
-
/* harmony export */ VitMatteImageProcessor: () => (/* reexport safe */
|
|
17432
|
-
/* harmony export */ VitPoseImageProcessor: () => (/* reexport safe */
|
|
17433
|
-
/* harmony export */ YolosFeatureExtractor: () => (/* reexport safe */
|
|
17434
|
-
/* harmony export */ YolosImageProcessor: () => (/* reexport safe */
|
|
17660
|
+
/* harmony export */ ViTFeatureExtractor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__.ViTFeatureExtractor),
|
|
17661
|
+
/* harmony export */ ViTImageProcessor: () => (/* reexport safe */ _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__.ViTImageProcessor),
|
|
17662
|
+
/* harmony export */ VitMatteImageProcessor: () => (/* reexport safe */ _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_34__.VitMatteImageProcessor),
|
|
17663
|
+
/* harmony export */ VitPoseImageProcessor: () => (/* reexport safe */ _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_35__.VitPoseImageProcessor),
|
|
17664
|
+
/* harmony export */ YolosFeatureExtractor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__.YolosFeatureExtractor),
|
|
17665
|
+
/* harmony export */ YolosImageProcessor: () => (/* reexport safe */ _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__.YolosImageProcessor)
|
|
17435
17666
|
/* harmony export */ });
|
|
17436
17667
|
/* harmony import */ var _beit_image_processing_beit_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./beit/image_processing_beit.js */ "./src/models/beit/image_processing_beit.js");
|
|
17437
17668
|
/* harmony import */ var _bit_image_processing_bit_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./bit/image_processing_bit.js */ "./src/models/bit/image_processing_bit.js");
|
|
@@ -17458,17 +17689,19 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
17458
17689
|
/* harmony import */ var _nougat_image_processing_nougat_js__WEBPACK_IMPORTED_MODULE_22__ = __webpack_require__(/*! ./nougat/image_processing_nougat.js */ "./src/models/nougat/image_processing_nougat.js");
|
|
17459
17690
|
/* harmony import */ var _owlv2_image_processing_owlv2_js__WEBPACK_IMPORTED_MODULE_23__ = __webpack_require__(/*! ./owlv2/image_processing_owlv2.js */ "./src/models/owlv2/image_processing_owlv2.js");
|
|
17460
17691
|
/* harmony import */ var _owlvit_image_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_24__ = __webpack_require__(/*! ./owlvit/image_processing_owlvit.js */ "./src/models/owlvit/image_processing_owlvit.js");
|
|
17461
|
-
/* harmony import */ var
|
|
17462
|
-
/* harmony import */ var
|
|
17463
|
-
/* harmony import */ var
|
|
17464
|
-
/* harmony import */ var
|
|
17465
|
-
/* harmony import */ var
|
|
17466
|
-
/* harmony import */ var
|
|
17467
|
-
/* harmony import */ var
|
|
17468
|
-
/* harmony import */ var
|
|
17469
|
-
/* harmony import */ var
|
|
17470
|
-
/* harmony import */ var
|
|
17471
|
-
/* harmony import */ var
|
|
17692
|
+
/* harmony import */ var _phi3_v_image_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_25__ = __webpack_require__(/*! ./phi3_v/image_processing_phi3_v.js */ "./src/models/phi3_v/image_processing_phi3_v.js");
|
|
17693
|
+
/* harmony import */ var _pvt_image_processing_pvt_js__WEBPACK_IMPORTED_MODULE_26__ = __webpack_require__(/*! ./pvt/image_processing_pvt.js */ "./src/models/pvt/image_processing_pvt.js");
|
|
17694
|
+
/* harmony import */ var _qwen2_vl_image_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_27__ = __webpack_require__(/*! ./qwen2_vl/image_processing_qwen2_vl.js */ "./src/models/qwen2_vl/image_processing_qwen2_vl.js");
|
|
17695
|
+
/* harmony import */ var _rt_detr_image_processing_rt_detr_js__WEBPACK_IMPORTED_MODULE_28__ = __webpack_require__(/*! ./rt_detr/image_processing_rt_detr.js */ "./src/models/rt_detr/image_processing_rt_detr.js");
|
|
17696
|
+
/* harmony import */ var _sam_image_processing_sam_js__WEBPACK_IMPORTED_MODULE_29__ = __webpack_require__(/*! ./sam/image_processing_sam.js */ "./src/models/sam/image_processing_sam.js");
|
|
17697
|
+
/* harmony import */ var _segformer_image_processing_segformer_js__WEBPACK_IMPORTED_MODULE_30__ = __webpack_require__(/*! ./segformer/image_processing_segformer.js */ "./src/models/segformer/image_processing_segformer.js");
|
|
17698
|
+
/* harmony import */ var _siglip_image_processing_siglip_js__WEBPACK_IMPORTED_MODULE_31__ = __webpack_require__(/*! ./siglip/image_processing_siglip.js */ "./src/models/siglip/image_processing_siglip.js");
|
|
17699
|
+
/* harmony import */ var _swin2sr_image_processing_swin2sr_js__WEBPACK_IMPORTED_MODULE_32__ = __webpack_require__(/*! ./swin2sr/image_processing_swin2sr.js */ "./src/models/swin2sr/image_processing_swin2sr.js");
|
|
17700
|
+
/* harmony import */ var _vit_image_processing_vit_js__WEBPACK_IMPORTED_MODULE_33__ = __webpack_require__(/*! ./vit/image_processing_vit.js */ "./src/models/vit/image_processing_vit.js");
|
|
17701
|
+
/* harmony import */ var _vitmatte_image_processing_vitmatte_js__WEBPACK_IMPORTED_MODULE_34__ = __webpack_require__(/*! ./vitmatte/image_processing_vitmatte.js */ "./src/models/vitmatte/image_processing_vitmatte.js");
|
|
17702
|
+
/* harmony import */ var _vitpose_image_processing_vitpose_js__WEBPACK_IMPORTED_MODULE_35__ = __webpack_require__(/*! ./vitpose/image_processing_vitpose.js */ "./src/models/vitpose/image_processing_vitpose.js");
|
|
17703
|
+
/* harmony import */ var _yolos_image_processing_yolos_js__WEBPACK_IMPORTED_MODULE_36__ = __webpack_require__(/*! ./yolos/image_processing_yolos.js */ "./src/models/yolos/image_processing_yolos.js");
|
|
17704
|
+
|
|
17472
17705
|
|
|
17473
17706
|
|
|
17474
17707
|
|
|
@@ -18143,6 +18376,87 @@ class MobileViTImageProcessor extends _base_image_processors_utils_js__WEBPACK_I
|
|
|
18143
18376
|
class MobileViTFeatureExtractor extends MobileViTImageProcessor { }
|
|
18144
18377
|
|
|
18145
18378
|
|
|
18379
|
+
/***/ }),
|
|
18380
|
+
|
|
18381
|
+
/***/ "./src/models/moonshine/feature_extraction_moonshine.js":
|
|
18382
|
+
/*!**************************************************************!*\
|
|
18383
|
+
!*** ./src/models/moonshine/feature_extraction_moonshine.js ***!
|
|
18384
|
+
\**************************************************************/
|
|
18385
|
+
/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
|
|
18386
|
+
|
|
18387
|
+
"use strict";
|
|
18388
|
+
__webpack_require__.r(__webpack_exports__);
|
|
18389
|
+
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
18390
|
+
/* harmony export */ MoonshineFeatureExtractor: () => (/* binding */ MoonshineFeatureExtractor)
|
|
18391
|
+
/* harmony export */ });
|
|
18392
|
+
/* harmony import */ var _base_feature_extraction_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/feature_extraction_utils.js */ "./src/base/feature_extraction_utils.js");
|
|
18393
|
+
/* harmony import */ var _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../../utils/tensor.js */ "./src/utils/tensor.js");
|
|
18394
|
+
|
|
18395
|
+
|
|
18396
|
+
|
|
18397
|
+
|
|
18398
|
+
class MoonshineFeatureExtractor extends _base_feature_extraction_utils_js__WEBPACK_IMPORTED_MODULE_0__.FeatureExtractor {
|
|
18399
|
+
/**
|
|
18400
|
+
* Asynchronously extracts input values from a given audio using the provided configuration.
|
|
18401
|
+
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
18402
|
+
* @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
|
|
18403
|
+
*/
|
|
18404
|
+
async _call(audio) {
|
|
18405
|
+
(0,_base_feature_extraction_utils_js__WEBPACK_IMPORTED_MODULE_0__.validate_audio_inputs)(audio, 'MoonshineFeatureExtractor');
|
|
18406
|
+
|
|
18407
|
+
if (audio instanceof Float64Array) {
|
|
18408
|
+
audio = new Float32Array(audio);
|
|
18409
|
+
}
|
|
18410
|
+
|
|
18411
|
+
const shape = [
|
|
18412
|
+
1, /* batch_size */
|
|
18413
|
+
audio.length, /* num_samples */
|
|
18414
|
+
];
|
|
18415
|
+
return {
|
|
18416
|
+
input_values: new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.Tensor('float32', audio, shape),
|
|
18417
|
+
};
|
|
18418
|
+
}
|
|
18419
|
+
}
|
|
18420
|
+
|
|
18421
|
+
|
|
18422
|
+
/***/ }),
|
|
18423
|
+
|
|
18424
|
+
/***/ "./src/models/moonshine/processing_moonshine.js":
|
|
18425
|
+
/*!******************************************************!*\
|
|
18426
|
+
!*** ./src/models/moonshine/processing_moonshine.js ***!
|
|
18427
|
+
\******************************************************/
|
|
18428
|
+
/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
|
|
18429
|
+
|
|
18430
|
+
"use strict";
|
|
18431
|
+
__webpack_require__.r(__webpack_exports__);
|
|
18432
|
+
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
18433
|
+
/* harmony export */ MoonshineProcessor: () => (/* binding */ MoonshineProcessor)
|
|
18434
|
+
/* harmony export */ });
|
|
18435
|
+
/* harmony import */ var _auto_feature_extraction_auto_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../auto/feature_extraction_auto.js */ "./src/models/auto/feature_extraction_auto.js");
|
|
18436
|
+
/* harmony import */ var _tokenizers_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../../tokenizers.js */ "./src/tokenizers.js");
|
|
18437
|
+
/* harmony import */ var _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../base/processing_utils.js */ "./src/base/processing_utils.js");
|
|
18438
|
+
|
|
18439
|
+
|
|
18440
|
+
|
|
18441
|
+
|
|
18442
|
+
/**
|
|
18443
|
+
* Represents a MoonshineProcessor that extracts features from an audio input.
|
|
18444
|
+
*/
|
|
18445
|
+
class MoonshineProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_2__.Processor {
|
|
18446
|
+
static tokenizer_class = _tokenizers_js__WEBPACK_IMPORTED_MODULE_1__.AutoTokenizer
|
|
18447
|
+
static feature_extractor_class = _auto_feature_extraction_auto_js__WEBPACK_IMPORTED_MODULE_0__.AutoFeatureExtractor
|
|
18448
|
+
|
|
18449
|
+
/**
|
|
18450
|
+
* Calls the feature_extractor function with the given audio input.
|
|
18451
|
+
* @param {any} audio The audio input to extract features from.
|
|
18452
|
+
* @returns {Promise<any>} A Promise that resolves with the extracted features.
|
|
18453
|
+
*/
|
|
18454
|
+
async _call(audio) {
|
|
18455
|
+
return await this.feature_extractor(audio);
|
|
18456
|
+
}
|
|
18457
|
+
}
|
|
18458
|
+
|
|
18459
|
+
|
|
18146
18460
|
/***/ }),
|
|
18147
18461
|
|
|
18148
18462
|
/***/ "./src/models/nougat/image_processing_nougat.js":
|
|
@@ -18336,6 +18650,256 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
18336
18650
|
}
|
|
18337
18651
|
|
|
18338
18652
|
|
|
18653
|
+
/***/ }),
|
|
18654
|
+
|
|
18655
|
+
/***/ "./src/models/phi3_v/image_processing_phi3_v.js":
|
|
18656
|
+
/*!******************************************************!*\
|
|
18657
|
+
!*** ./src/models/phi3_v/image_processing_phi3_v.js ***!
|
|
18658
|
+
\******************************************************/
|
|
18659
|
+
/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
|
|
18660
|
+
|
|
18661
|
+
"use strict";
|
|
18662
|
+
__webpack_require__.r(__webpack_exports__);
|
|
18663
|
+
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
18664
|
+
/* harmony export */ Phi3VImageProcessor: () => (/* binding */ Phi3VImageProcessor)
|
|
18665
|
+
/* harmony export */ });
|
|
18666
|
+
/* harmony import */ var _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/image_processors_utils.js */ "./src/base/image_processors_utils.js");
|
|
18667
|
+
/* harmony import */ var _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../../utils/tensor.js */ "./src/utils/tensor.js");
|
|
18668
|
+
|
|
18669
|
+
|
|
18670
|
+
|
|
18671
|
+
const IMAGE_SIZE = 336;
|
|
18672
|
+
const SLICE_AXES = [2, 3]; // axes to slice on
|
|
18673
|
+
const { ceil, floor, sqrt } = Math;
|
|
18674
|
+
|
|
18675
|
+
class Phi3VImageProcessor extends _base_image_processors_utils_js__WEBPACK_IMPORTED_MODULE_0__.ImageProcessor {
|
|
18676
|
+
constructor(config) {
|
|
18677
|
+
super({
|
|
18678
|
+
...config,
|
|
18679
|
+
do_normalize: true,
|
|
18680
|
+
do_pad: true,
|
|
18681
|
+
pad_size: 'custom',
|
|
18682
|
+
do_convert_rgb: true,
|
|
18683
|
+
do_resize: true, // Smart resizing "hd_transform"
|
|
18684
|
+
});
|
|
18685
|
+
|
|
18686
|
+
this._num_crops = config.num_crops;
|
|
18687
|
+
}
|
|
18688
|
+
calc_num_image_tokens_from_image_size(width, height) {
|
|
18689
|
+
// @ts-expect-error
|
|
18690
|
+
const { num_img_tokens } = this.config;
|
|
18691
|
+
return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens));
|
|
18692
|
+
}
|
|
18693
|
+
|
|
18694
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
18695
|
+
get_resize_output_image_size(image, size) {
|
|
18696
|
+
const hd_num = this._num_crops;
|
|
18697
|
+
const [width, height] = image.size
|
|
18698
|
+
|
|
18699
|
+
let ratio = width / height;
|
|
18700
|
+
let scale = 1;
|
|
18701
|
+
|
|
18702
|
+
// Calculate the scaling factor
|
|
18703
|
+
while (scale * Math.ceil(scale / ratio) <= hd_num) {
|
|
18704
|
+
scale += 1;
|
|
18705
|
+
}
|
|
18706
|
+
scale -= 1;
|
|
18707
|
+
|
|
18708
|
+
// Compute the new dimensions
|
|
18709
|
+
const new_w = Math.floor(scale * 336);
|
|
18710
|
+
const new_h = Math.floor(new_w / ratio);
|
|
18711
|
+
|
|
18712
|
+
return [new_w, new_h]
|
|
18713
|
+
}
|
|
18714
|
+
|
|
18715
|
+
|
|
18716
|
+
/** @type {ImageProcessor['pad_image']} */
|
|
18717
|
+
pad_image(pixelData, imgDims, padSize, options = {}) {
|
|
18718
|
+
// Phi3V uses a custom padding strategy:
|
|
18719
|
+
// - Pad to a multiple of 336
|
|
18720
|
+
// - Pad with white pixels
|
|
18721
|
+
const [imageHeight, imageWidth] = imgDims;
|
|
18722
|
+
const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE);
|
|
18723
|
+
const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE);
|
|
18724
|
+
|
|
18725
|
+
// NOTE: Since padding is done after normalization, we need to fill with the normalized values
|
|
18726
|
+
const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]);
|
|
18727
|
+
return super.pad_image(pixelData, imgDims, { width, height }, {
|
|
18728
|
+
center: true,
|
|
18729
|
+
constant_values,
|
|
18730
|
+
...options,
|
|
18731
|
+
});
|
|
18732
|
+
}
|
|
18733
|
+
|
|
18734
|
+
async _call(images, {
|
|
18735
|
+
num_crops = null,
|
|
18736
|
+
} = {}) {
|
|
18737
|
+
// @ts-expect-error
|
|
18738
|
+
this._num_crops = num_crops ??= this.config.num_crops;
|
|
18739
|
+
if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) {
|
|
18740
|
+
throw new Error("num_crops must be a square number >= 4");
|
|
18741
|
+
}
|
|
18742
|
+
|
|
18743
|
+
if (!Array.isArray(images)) {
|
|
18744
|
+
images = [images];
|
|
18745
|
+
}
|
|
18746
|
+
|
|
18747
|
+
const num_images = images.length;
|
|
18748
|
+
const imageData = await Promise.all(images.map(x => this.preprocess(x)));
|
|
18749
|
+
|
|
18750
|
+
const original_sizes = imageData.map(x => x.original_size);
|
|
18751
|
+
const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);
|
|
18752
|
+
|
|
18753
|
+
// Process each image in batch
|
|
18754
|
+
const all_pixel_values = [];
|
|
18755
|
+
for (const { pixel_values } of imageData) {
|
|
18756
|
+
pixel_values.unsqueeze_(0); // Easier processing as 4D tensor
|
|
18757
|
+
|
|
18758
|
+
const [height, width] = pixel_values.dims.slice(-2);
|
|
18759
|
+
|
|
18760
|
+
// Global image (Tensor of shape [num_channels, height, width])
|
|
18761
|
+
const batch_pixel_values = await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.interpolate_4d)(pixel_values, {
|
|
18762
|
+
size: [IMAGE_SIZE, IMAGE_SIZE],
|
|
18763
|
+
mode: 'bicubic',
|
|
18764
|
+
});
|
|
18765
|
+
|
|
18766
|
+
if (num_crops > 0) {
|
|
18767
|
+
const patches = [];
|
|
18768
|
+
const sqrt_patches = sqrt(num_crops);
|
|
18769
|
+
const patch_width = floor(width / sqrt_patches);
|
|
18770
|
+
const patch_height = floor(height / sqrt_patches);
|
|
18771
|
+
for (let y = 0; y < sqrt_patches; ++y) {
|
|
18772
|
+
for (let x = 0; x < sqrt_patches; ++x) {
|
|
18773
|
+
let start_x, start_y, end_x, end_y;
|
|
18774
|
+
if (y === sqrt_patches - 1) { // At bottom
|
|
18775
|
+
start_y = height - patch_height;
|
|
18776
|
+
end_y = height;
|
|
18777
|
+
} else {
|
|
18778
|
+
start_y = y * patch_height;
|
|
18779
|
+
end_y = (y + 1) * patch_height;
|
|
18780
|
+
}
|
|
18781
|
+
if (x === sqrt_patches - 1) { // At right
|
|
18782
|
+
start_x = width - patch_width;
|
|
18783
|
+
end_x = width;
|
|
18784
|
+
} else {
|
|
18785
|
+
start_x = x * patch_width;
|
|
18786
|
+
end_x = (x + 1) * patch_width;
|
|
18787
|
+
}
|
|
18788
|
+
|
|
18789
|
+
const starts = [start_y, start_x];
|
|
18790
|
+
const ends = [end_y, end_x];
|
|
18791
|
+
const patch = await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.slice)(pixel_values, starts, ends, SLICE_AXES);
|
|
18792
|
+
patches.push(patch);
|
|
18793
|
+
}
|
|
18794
|
+
}
|
|
18795
|
+
|
|
18796
|
+
const resized_tensors = await (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.interpolate_4d)((0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.cat)(patches, 0), {
|
|
18797
|
+
size: [IMAGE_SIZE, IMAGE_SIZE],
|
|
18798
|
+
mode: 'bicubic',
|
|
18799
|
+
}); // [num_crops, 3, 336, 336]
|
|
18800
|
+
|
|
18801
|
+
// Concatenate the global image with the patches
|
|
18802
|
+
all_pixel_values.push((0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.cat)([batch_pixel_values, resized_tensors], 0));
|
|
18803
|
+
} else {
|
|
18804
|
+
// Only use the global image
|
|
18805
|
+
// NOTE: Not currently supported in modelling code
|
|
18806
|
+
all_pixel_values.push(batch_pixel_values);
|
|
18807
|
+
}
|
|
18808
|
+
}
|
|
18809
|
+
|
|
18810
|
+
// [num_images, 1 + num_crops, num_channels=3, height, width]
|
|
18811
|
+
const pixel_values = (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.stack)(all_pixel_values, 0);
|
|
18812
|
+
|
|
18813
|
+
// Calculate padded image sizes
|
|
18814
|
+
const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));
|
|
18815
|
+
|
|
18816
|
+
const image_sizes = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__.Tensor(
|
|
18817
|
+
'int64',
|
|
18818
|
+
sizes.flat(),
|
|
18819
|
+
[num_images, 2],
|
|
18820
|
+
);
|
|
18821
|
+
|
|
18822
|
+
const num_img_tokens = sizes.map(
|
|
18823
|
+
([height, width]) => this.calc_num_image_tokens_from_image_size(width, height),
|
|
18824
|
+
);
|
|
18825
|
+
|
|
18826
|
+
return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
|
|
18827
|
+
}
|
|
18828
|
+
}
|
|
18829
|
+
|
|
18830
|
+
|
|
18831
|
+
/***/ }),
|
|
18832
|
+
|
|
18833
|
+
/***/ "./src/models/phi3_v/processing_phi3_v.js":
|
|
18834
|
+
/*!************************************************!*\
|
|
18835
|
+
!*** ./src/models/phi3_v/processing_phi3_v.js ***!
|
|
18836
|
+
\************************************************/
|
|
18837
|
+
/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
|
|
18838
|
+
|
|
18839
|
+
"use strict";
|
|
18840
|
+
__webpack_require__.r(__webpack_exports__);
|
|
18841
|
+
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
18842
|
+
/* harmony export */ Phi3VProcessor: () => (/* binding */ Phi3VProcessor)
|
|
18843
|
+
/* harmony export */ });
|
|
18844
|
+
/* harmony import */ var _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/processing_utils.js */ "./src/base/processing_utils.js");
|
|
18845
|
+
/* harmony import */ var _auto_image_processing_auto_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../auto/image_processing_auto.js */ "./src/models/auto/image_processing_auto.js");
|
|
18846
|
+
/* harmony import */ var _tokenizers_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../tokenizers.js */ "./src/tokenizers.js");
|
|
18847
|
+
/* harmony import */ var _utils_image_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../../utils/image.js */ "./src/utils/image.js");
|
|
18848
|
+
|
|
18849
|
+
|
|
18850
|
+
|
|
18851
|
+
|
|
18852
|
+
|
|
18853
|
+
const IMAGE_TOKEN = "<|image|>";
|
|
18854
|
+
const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g;
|
|
18855
|
+
|
|
18856
|
+
class Phi3VProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__.Processor {
|
|
18857
|
+
static image_processor_class = _auto_image_processing_auto_js__WEBPACK_IMPORTED_MODULE_1__.AutoImageProcessor
|
|
18858
|
+
static tokenizer_class = _tokenizers_js__WEBPACK_IMPORTED_MODULE_2__.AutoTokenizer
|
|
18859
|
+
|
|
18860
|
+
/**
|
|
18861
|
+
*
|
|
18862
|
+
* @param {string|string[]} text
|
|
18863
|
+
* @param {RawImage|RawImage[]} images
|
|
18864
|
+
* @param {...any} args
|
|
18865
|
+
* @returns {Promise<any>}
|
|
18866
|
+
*/
|
|
18867
|
+
async _call(text, images = null, {
|
|
18868
|
+
padding = true,
|
|
18869
|
+
truncation = true,
|
|
18870
|
+
num_crops = null,
|
|
18871
|
+
} = {}) {
|
|
18872
|
+
|
|
18873
|
+
if (!Array.isArray(text)) {
|
|
18874
|
+
text = [text];
|
|
18875
|
+
}
|
|
18876
|
+
|
|
18877
|
+
let text_inputs, image_inputs;
|
|
18878
|
+
if (images) {
|
|
18879
|
+
image_inputs = await this.image_processor(images, { num_crops });
|
|
18880
|
+
const { num_img_tokens } = image_inputs;
|
|
18881
|
+
|
|
18882
|
+
// The original implementation adds a bos_token before the image tokens
|
|
18883
|
+
// TODO: Check if this affects performance, since it looks like a bug in the original implementation
|
|
18884
|
+
const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i])));
|
|
18885
|
+
|
|
18886
|
+
text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });
|
|
18887
|
+
|
|
18888
|
+
// The model expects image tokens to be negative, so we negate the image token ids
|
|
18889
|
+
const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
|
|
18890
|
+
text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id);
|
|
18891
|
+
} else {
|
|
18892
|
+
text_inputs = this.tokenizer(text);
|
|
18893
|
+
}
|
|
18894
|
+
|
|
18895
|
+
return {
|
|
18896
|
+
...text_inputs,
|
|
18897
|
+
...image_inputs,
|
|
18898
|
+
}
|
|
18899
|
+
}
|
|
18900
|
+
}
|
|
18901
|
+
|
|
18902
|
+
|
|
18339
18903
|
/***/ }),
|
|
18340
18904
|
|
|
18341
18905
|
/***/ "./src/models/processors.js":
|
|
@@ -18348,32 +18912,38 @@ class PaliGemmaProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MOD
|
|
|
18348
18912
|
__webpack_require__.r(__webpack_exports__);
|
|
18349
18913
|
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
18350
18914
|
/* harmony export */ Florence2Processor: () => (/* reexport safe */ _florence2_processing_florence2_js__WEBPACK_IMPORTED_MODULE_0__.Florence2Processor),
|
|
18351
|
-
/* harmony export */ Idefics3Processor: () => (/* reexport safe */
|
|
18352
|
-
/* harmony export */ JinaCLIPProcessor: () => (/* reexport safe */
|
|
18915
|
+
/* harmony export */ Idefics3Processor: () => (/* reexport safe */ _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_3__.Idefics3Processor),
|
|
18916
|
+
/* harmony export */ JinaCLIPProcessor: () => (/* reexport safe */ _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_5__.JinaCLIPProcessor),
|
|
18353
18917
|
/* harmony export */ MgpstrProcessor: () => (/* reexport safe */ _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_1__.MgpstrProcessor),
|
|
18354
|
-
/* harmony export */
|
|
18355
|
-
/* harmony export */
|
|
18356
|
-
/* harmony export */
|
|
18357
|
-
/* harmony export */
|
|
18358
|
-
/* harmony export */
|
|
18359
|
-
/* harmony export */
|
|
18360
|
-
/* harmony export */
|
|
18361
|
-
/* harmony export */
|
|
18362
|
-
/* harmony export */
|
|
18918
|
+
/* harmony export */ MoonshineProcessor: () => (/* reexport safe */ _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_2__.MoonshineProcessor),
|
|
18919
|
+
/* harmony export */ OwlViTProcessor: () => (/* reexport safe */ _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_6__.OwlViTProcessor),
|
|
18920
|
+
/* harmony export */ PaliGemmaProcessor: () => (/* reexport safe */ _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_8__.PaliGemmaProcessor),
|
|
18921
|
+
/* harmony export */ Phi3VProcessor: () => (/* reexport safe */ _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_7__.Phi3VProcessor),
|
|
18922
|
+
/* harmony export */ PyAnnoteProcessor: () => (/* reexport safe */ _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_9__.PyAnnoteProcessor),
|
|
18923
|
+
/* harmony export */ Qwen2VLProcessor: () => (/* reexport safe */ _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_10__.Qwen2VLProcessor),
|
|
18924
|
+
/* harmony export */ SamProcessor: () => (/* reexport safe */ _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_11__.SamProcessor),
|
|
18925
|
+
/* harmony export */ SpeechT5Processor: () => (/* reexport safe */ _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_12__.SpeechT5Processor),
|
|
18926
|
+
/* harmony export */ VLChatProcessor: () => (/* reexport safe */ _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_4__.VLChatProcessor),
|
|
18927
|
+
/* harmony export */ Wav2Vec2ProcessorWithLM: () => (/* reexport safe */ _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_13__.Wav2Vec2ProcessorWithLM),
|
|
18928
|
+
/* harmony export */ WhisperProcessor: () => (/* reexport safe */ _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_14__.WhisperProcessor)
|
|
18363
18929
|
/* harmony export */ });
|
|
18364
18930
|
/* harmony import */ var _florence2_processing_florence2_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ./florence2/processing_florence2.js */ "./src/models/florence2/processing_florence2.js");
|
|
18365
18931
|
/* harmony import */ var _mgp_str_processing_mgp_str_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./mgp_str/processing_mgp_str.js */ "./src/models/mgp_str/processing_mgp_str.js");
|
|
18366
|
-
/* harmony import */ var
|
|
18367
|
-
/* harmony import */ var
|
|
18368
|
-
/* harmony import */ var
|
|
18369
|
-
/* harmony import */ var
|
|
18370
|
-
/* harmony import */ var
|
|
18371
|
-
/* harmony import */ var
|
|
18372
|
-
/* harmony import */ var
|
|
18373
|
-
/* harmony import */ var
|
|
18374
|
-
/* harmony import */ var
|
|
18375
|
-
/* harmony import */ var
|
|
18376
|
-
/* harmony import */ var
|
|
18932
|
+
/* harmony import */ var _moonshine_processing_moonshine_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ./moonshine/processing_moonshine.js */ "./src/models/moonshine/processing_moonshine.js");
|
|
18933
|
+
/* harmony import */ var _idefics3_processing_idefics3_js__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ./idefics3/processing_idefics3.js */ "./src/models/idefics3/processing_idefics3.js");
|
|
18934
|
+
/* harmony import */ var _janus_processing_janus_js__WEBPACK_IMPORTED_MODULE_4__ = __webpack_require__(/*! ./janus/processing_janus.js */ "./src/models/janus/processing_janus.js");
|
|
18935
|
+
/* harmony import */ var _jina_clip_processing_jina_clip_js__WEBPACK_IMPORTED_MODULE_5__ = __webpack_require__(/*! ./jina_clip/processing_jina_clip.js */ "./src/models/jina_clip/processing_jina_clip.js");
|
|
18936
|
+
/* harmony import */ var _owlvit_processing_owlvit_js__WEBPACK_IMPORTED_MODULE_6__ = __webpack_require__(/*! ./owlvit/processing_owlvit.js */ "./src/models/owlvit/processing_owlvit.js");
|
|
18937
|
+
/* harmony import */ var _phi3_v_processing_phi3_v_js__WEBPACK_IMPORTED_MODULE_7__ = __webpack_require__(/*! ./phi3_v/processing_phi3_v.js */ "./src/models/phi3_v/processing_phi3_v.js");
|
|
18938
|
+
/* harmony import */ var _paligemma_processing_paligemma_js__WEBPACK_IMPORTED_MODULE_8__ = __webpack_require__(/*! ./paligemma/processing_paligemma.js */ "./src/models/paligemma/processing_paligemma.js");
|
|
18939
|
+
/* harmony import */ var _pyannote_processing_pyannote_js__WEBPACK_IMPORTED_MODULE_9__ = __webpack_require__(/*! ./pyannote/processing_pyannote.js */ "./src/models/pyannote/processing_pyannote.js");
|
|
18940
|
+
/* harmony import */ var _qwen2_vl_processing_qwen2_vl_js__WEBPACK_IMPORTED_MODULE_10__ = __webpack_require__(/*! ./qwen2_vl/processing_qwen2_vl.js */ "./src/models/qwen2_vl/processing_qwen2_vl.js");
|
|
18941
|
+
/* harmony import */ var _sam_processing_sam_js__WEBPACK_IMPORTED_MODULE_11__ = __webpack_require__(/*! ./sam/processing_sam.js */ "./src/models/sam/processing_sam.js");
|
|
18942
|
+
/* harmony import */ var _speecht5_processing_speecht5_js__WEBPACK_IMPORTED_MODULE_12__ = __webpack_require__(/*! ./speecht5/processing_speecht5.js */ "./src/models/speecht5/processing_speecht5.js");
|
|
18943
|
+
/* harmony import */ var _wav2vec2_processing_wav2vec2_js__WEBPACK_IMPORTED_MODULE_13__ = __webpack_require__(/*! ./wav2vec2/processing_wav2vec2.js */ "./src/models/wav2vec2/processing_wav2vec2.js");
|
|
18944
|
+
/* harmony import */ var _whisper_processing_whisper_js__WEBPACK_IMPORTED_MODULE_14__ = __webpack_require__(/*! ./whisper/processing_whisper.js */ "./src/models/whisper/processing_whisper.js");
|
|
18945
|
+
|
|
18946
|
+
|
|
18377
18947
|
|
|
18378
18948
|
|
|
18379
18949
|
|
|
@@ -18423,6 +18993,8 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
18423
18993
|
/* harmony export */ });
|
|
18424
18994
|
/* harmony import */ var _base_feature_extraction_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/feature_extraction_utils.js */ "./src/base/feature_extraction_utils.js");
|
|
18425
18995
|
/* harmony import */ var _utils_tensor_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../../utils/tensor.js */ "./src/utils/tensor.js");
|
|
18996
|
+
/* harmony import */ var _utils_maths_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../utils/maths.js */ "./src/utils/maths.js");
|
|
18997
|
+
|
|
18426
18998
|
|
|
18427
18999
|
|
|
18428
19000
|
|
|
@@ -18450,41 +19022,6 @@ class PyAnnoteFeatureExtractor extends _base_feature_extraction_utils_js__WEBPAC
|
|
|
18450
19022
|
};
|
|
18451
19023
|
}
|
|
18452
19024
|
|
|
18453
|
-
}
|
|
18454
|
-
|
|
18455
|
-
|
|
18456
|
-
/***/ }),
|
|
18457
|
-
|
|
18458
|
-
/***/ "./src/models/pyannote/processing_pyannote.js":
|
|
18459
|
-
/*!****************************************************!*\
|
|
18460
|
-
!*** ./src/models/pyannote/processing_pyannote.js ***!
|
|
18461
|
-
\****************************************************/
|
|
18462
|
-
/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
|
|
18463
|
-
|
|
18464
|
-
"use strict";
|
|
18465
|
-
__webpack_require__.r(__webpack_exports__);
|
|
18466
|
-
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
18467
|
-
/* harmony export */ PyAnnoteProcessor: () => (/* binding */ PyAnnoteProcessor)
|
|
18468
|
-
/* harmony export */ });
|
|
18469
|
-
/* harmony import */ var _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/processing_utils.js */ "./src/base/processing_utils.js");
|
|
18470
|
-
/* harmony import */ var _auto_feature_extraction_auto_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ../auto/feature_extraction_auto.js */ "./src/models/auto/feature_extraction_auto.js");
|
|
18471
|
-
/* harmony import */ var _utils_maths_js__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../utils/maths.js */ "./src/utils/maths.js");
|
|
18472
|
-
|
|
18473
|
-
|
|
18474
|
-
|
|
18475
|
-
|
|
18476
|
-
class PyAnnoteProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__.Processor {
|
|
18477
|
-
static feature_extractor_class = _auto_feature_extraction_auto_js__WEBPACK_IMPORTED_MODULE_1__.AutoFeatureExtractor
|
|
18478
|
-
|
|
18479
|
-
/**
|
|
18480
|
-
* Calls the feature_extractor function with the given audio input.
|
|
18481
|
-
* @param {any} audio The audio input to extract features from.
|
|
18482
|
-
* @returns {Promise<any>} A Promise that resolves with the extracted features.
|
|
18483
|
-
*/
|
|
18484
|
-
async _call(audio) {
|
|
18485
|
-
return await this.feature_extractor(audio)
|
|
18486
|
-
}
|
|
18487
|
-
|
|
18488
19025
|
/**
|
|
18489
19026
|
* NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
|
|
18490
19027
|
* @param {number} samples The number of frames in the audio.
|
|
@@ -18539,6 +19076,48 @@ class PyAnnoteProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODU
|
|
|
18539
19076
|
}
|
|
18540
19077
|
return results;
|
|
18541
19078
|
}
|
|
19079
|
+
|
|
19080
|
+
}
|
|
19081
|
+
|
|
19082
|
+
|
|
19083
|
+
/***/ }),
|
|
19084
|
+
|
|
19085
|
+
/***/ "./src/models/pyannote/processing_pyannote.js":
|
|
19086
|
+
/*!****************************************************!*\
|
|
19087
|
+
!*** ./src/models/pyannote/processing_pyannote.js ***!
|
|
19088
|
+
\****************************************************/
|
|
19089
|
+
/***/ ((__unused_webpack___webpack_module__, __webpack_exports__, __webpack_require__) => {
|
|
19090
|
+
|
|
19091
|
+
"use strict";
|
|
19092
|
+
__webpack_require__.r(__webpack_exports__);
|
|
19093
|
+
/* harmony export */ __webpack_require__.d(__webpack_exports__, {
|
|
19094
|
+
/* harmony export */ PyAnnoteProcessor: () => (/* binding */ PyAnnoteProcessor)
|
|
19095
|
+
/* harmony export */ });
|
|
19096
|
+
/* harmony import */ var _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! ../../base/processing_utils.js */ "./src/base/processing_utils.js");
|
|
19097
|
+
/* harmony import */ var _feature_extraction_pyannote_js__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! ./feature_extraction_pyannote.js */ "./src/models/pyannote/feature_extraction_pyannote.js");
|
|
19098
|
+
|
|
19099
|
+
|
|
19100
|
+
|
|
19101
|
+
class PyAnnoteProcessor extends _base_processing_utils_js__WEBPACK_IMPORTED_MODULE_0__.Processor {
|
|
19102
|
+
static feature_extractor_class = _feature_extraction_pyannote_js__WEBPACK_IMPORTED_MODULE_1__.PyAnnoteFeatureExtractor
|
|
19103
|
+
|
|
19104
|
+
/**
|
|
19105
|
+
* Calls the feature_extractor function with the given audio input.
|
|
19106
|
+
* @param {any} audio The audio input to extract features from.
|
|
19107
|
+
* @returns {Promise<any>} A Promise that resolves with the extracted features.
|
|
19108
|
+
*/
|
|
19109
|
+
async _call(audio) {
|
|
19110
|
+
return await this.feature_extractor(audio)
|
|
19111
|
+
}
|
|
19112
|
+
|
|
19113
|
+
/** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
|
|
19114
|
+
post_process_speaker_diarization(...args) {
|
|
19115
|
+
return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
|
|
19116
|
+
}
|
|
19117
|
+
|
|
19118
|
+
get sampling_rate() {
|
|
19119
|
+
return this.feature_extractor.config.sampling_rate;
|
|
19120
|
+
}
|
|
18542
19121
|
}
|
|
18543
19122
|
|
|
18544
19123
|
|
|
@@ -20288,6 +20867,17 @@ class TensorOpRegistry {
|
|
|
20288
20867
|
}
|
|
20289
20868
|
return this._top_k;
|
|
20290
20869
|
}
|
|
20870
|
+
|
|
20871
|
+
static get slice() {
|
|
20872
|
+
if (!this._slice) {
|
|
20873
|
+
this._slice = wrap(
|
|
20874
|
+
[8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13],
|
|
20875
|
+
this.session_options,
|
|
20876
|
+
'y',
|
|
20877
|
+
)
|
|
20878
|
+
}
|
|
20879
|
+
return this._slice;
|
|
20880
|
+
}
|
|
20291
20881
|
}
|
|
20292
20882
|
|
|
20293
20883
|
|
|
@@ -20980,7 +21570,7 @@ class FillMaskPipeline extends (/** @type {new (options: TextPipelineConstructor
|
|
|
20980
21570
|
return {
|
|
20981
21571
|
score: values[i],
|
|
20982
21572
|
token: Number(x),
|
|
20983
|
-
token_str: this.tokenizer.
|
|
21573
|
+
token_str: this.tokenizer.decode([x]),
|
|
20984
21574
|
sequence: this.tokenizer.decode(sequence, { skip_special_tokens: true }),
|
|
20985
21575
|
}
|
|
20986
21576
|
}));
|
|
@@ -22021,6 +22611,8 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22021
22611
|
case 'unispeech-sat':
|
|
22022
22612
|
case 'hubert':
|
|
22023
22613
|
return this._call_wav2vec2(audio, kwargs)
|
|
22614
|
+
case 'moonshine':
|
|
22615
|
+
return this._call_moonshine(audio, kwargs)
|
|
22024
22616
|
default:
|
|
22025
22617
|
throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
|
|
22026
22618
|
}
|
|
@@ -22174,6 +22766,34 @@ class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options: TextA
|
|
|
22174
22766
|
}
|
|
22175
22767
|
return single ? toReturn[0] : toReturn;
|
|
22176
22768
|
}
|
|
22769
|
+
|
|
22770
|
+
/**
|
|
22771
|
+
* @type {AutomaticSpeechRecognitionPipelineCallback}
|
|
22772
|
+
* @private
|
|
22773
|
+
*/
|
|
22774
|
+
async _call_moonshine(audio, kwargs) {
|
|
22775
|
+
const single = !Array.isArray(audio);
|
|
22776
|
+
if (single) {
|
|
22777
|
+
audio = [/** @type {AudioInput} */ (audio)];
|
|
22778
|
+
}
|
|
22779
|
+
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
22780
|
+
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
22781
|
+
const toReturn = [];
|
|
22782
|
+
for (const aud of preparedAudios) {
|
|
22783
|
+
const inputs = await this.processor(aud);
|
|
22784
|
+
|
|
22785
|
+
// According to the [paper](https://arxiv.org/pdf/2410.15608):
|
|
22786
|
+
// "We use greedy decoding, with a heuristic limit of 6 output tokens
|
|
22787
|
+
// per second of audio to avoid repeated output sequences."
|
|
22788
|
+
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
22789
|
+
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
22790
|
+
|
|
22791
|
+
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
22792
|
+
toReturn.push({ text });
|
|
22793
|
+
}
|
|
22794
|
+
return single ? toReturn[0] : toReturn;
|
|
22795
|
+
}
|
|
22796
|
+
|
|
22177
22797
|
}
|
|
22178
22798
|
|
|
22179
22799
|
/**
|
|
@@ -32372,7 +32992,9 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
32372
32992
|
/* harmony export */ ones_like: () => (/* binding */ ones_like),
|
|
32373
32993
|
/* harmony export */ permute: () => (/* binding */ permute),
|
|
32374
32994
|
/* harmony export */ quantize_embeddings: () => (/* binding */ quantize_embeddings),
|
|
32995
|
+
/* harmony export */ rand: () => (/* binding */ rand),
|
|
32375
32996
|
/* harmony export */ rfft: () => (/* binding */ rfft),
|
|
32997
|
+
/* harmony export */ slice: () => (/* binding */ slice),
|
|
32376
32998
|
/* harmony export */ stack: () => (/* binding */ stack),
|
|
32377
32999
|
/* harmony export */ std_mean: () => (/* binding */ std_mean),
|
|
32378
33000
|
/* harmony export */ topk: () => (/* binding */ topk),
|
|
@@ -33151,8 +33773,21 @@ class Tensor {
|
|
|
33151
33773
|
if (!DataTypeMap.hasOwnProperty(type)) {
|
|
33152
33774
|
throw new Error(`Unsupported type: ${type}`);
|
|
33153
33775
|
}
|
|
33776
|
+
|
|
33777
|
+
// Handle special cases where a mapping function is needed (e.g., where one type is a bigint and the other is a number)
|
|
33778
|
+
let map_fn;
|
|
33779
|
+
const is_source_bigint = ['int64', 'uint64'].includes(this.type);
|
|
33780
|
+
const is_dest_bigint = ['int64', 'uint64'].includes(type);
|
|
33781
|
+
if (is_source_bigint && !is_dest_bigint) {
|
|
33782
|
+
// TypeError: Cannot convert a BigInt value to a number
|
|
33783
|
+
map_fn = Number;
|
|
33784
|
+
} else if (!is_source_bigint && is_dest_bigint) {
|
|
33785
|
+
// TypeError: Cannot convert [x] to a BigInt
|
|
33786
|
+
map_fn = BigInt;
|
|
33787
|
+
}
|
|
33788
|
+
|
|
33154
33789
|
// @ts-ignore
|
|
33155
|
-
return new Tensor(type, DataTypeMap[type].from(this.data), this.dims);
|
|
33790
|
+
return new Tensor(type, DataTypeMap[type].from(this.data, map_fn), this.dims);
|
|
33156
33791
|
}
|
|
33157
33792
|
}
|
|
33158
33793
|
|
|
@@ -33350,6 +33985,29 @@ async function topk(x, k) {
|
|
|
33350
33985
|
});
|
|
33351
33986
|
}
|
|
33352
33987
|
|
|
33988
|
+
|
|
33989
|
+
const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length]);
|
|
33990
|
+
/**
|
|
33991
|
+
* Slice a multidimensional float32 tensor.
|
|
33992
|
+
* @param {Tensor} data: Tensor of data to extract slices from
|
|
33993
|
+
* @param {number[]} starts: 1-D array of starting indices of corresponding axis in axes
|
|
33994
|
+
* @param {number[]} ends: 1-D array of ending indices (exclusive) of corresponding axis in axes
|
|
33995
|
+
* @param {number[]} axes: 1-D array of axes that starts and ends apply to
|
|
33996
|
+
* @param {number[]} [steps]: 1-D array of slice step of corresponding axis in axes.
|
|
33997
|
+
* @returns {Promise<Tensor>} Sliced data tensor.
|
|
33998
|
+
*/
|
|
33999
|
+
async function slice(data, starts, ends, axes, steps) {
|
|
34000
|
+
const op = await _ops_registry_js__WEBPACK_IMPORTED_MODULE_2__.TensorOpRegistry.slice;
|
|
34001
|
+
return await op({
|
|
34002
|
+
x: data,
|
|
34003
|
+
s: arrayToIndexTensor(starts),
|
|
34004
|
+
e: arrayToIndexTensor(ends),
|
|
34005
|
+
a: arrayToIndexTensor(axes),
|
|
34006
|
+
t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
|
|
34007
|
+
});
|
|
34008
|
+
}
|
|
34009
|
+
|
|
34010
|
+
|
|
33353
34011
|
/**
|
|
33354
34012
|
* Perform mean pooling of the last hidden state followed by a normalization step.
|
|
33355
34013
|
* @param {Tensor} last_hidden_state Tensor of shape [batchSize, seqLength, embedDim]
|
|
@@ -33796,6 +34454,20 @@ function zeros_like(tensor) {
|
|
|
33796
34454
|
return zeros(tensor.dims);
|
|
33797
34455
|
}
|
|
33798
34456
|
|
|
34457
|
+
/**
|
|
34458
|
+
* Returns a tensor filled with random numbers from a uniform distribution on the interval [0, 1)
|
|
34459
|
+
* @param {number[]} size A sequence of integers defining the shape of the output tensor.
|
|
34460
|
+
* @returns {Tensor} The random tensor.
|
|
34461
|
+
*/
|
|
34462
|
+
function rand(size) {
|
|
34463
|
+
const length = size.reduce((a, b) => a * b, 1);
|
|
34464
|
+
return new Tensor(
|
|
34465
|
+
"float32",
|
|
34466
|
+
Float32Array.from({ length }, () => Math.random()),
|
|
34467
|
+
size,
|
|
34468
|
+
)
|
|
34469
|
+
}
|
|
34470
|
+
|
|
33799
34471
|
/**
|
|
33800
34472
|
* Quantizes the embeddings tensor to binary or unsigned binary precision.
|
|
33801
34473
|
* @param {Tensor} tensor The tensor to quantize.
|
|
@@ -34141,6 +34813,9 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
34141
34813
|
/* harmony export */ EsmModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.EsmModel),
|
|
34142
34814
|
/* harmony export */ EsmPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.EsmPreTrainedModel),
|
|
34143
34815
|
/* harmony export */ EsmTokenizer: () => (/* reexport safe */ _tokenizers_js__WEBPACK_IMPORTED_MODULE_3__.EsmTokenizer),
|
|
34816
|
+
/* harmony export */ ExaoneForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ExaoneForCausalLM),
|
|
34817
|
+
/* harmony export */ ExaoneModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ExaoneModel),
|
|
34818
|
+
/* harmony export */ ExaonePreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ExaonePreTrainedModel),
|
|
34144
34819
|
/* harmony export */ FFT: () => (/* reexport safe */ _utils_maths_js__WEBPACK_IMPORTED_MODULE_8__.FFT),
|
|
34145
34820
|
/* harmony export */ FalconForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.FalconForCausalLM),
|
|
34146
34821
|
/* harmony export */ FalconModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.FalconModel),
|
|
@@ -34319,7 +34994,17 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
34319
34994
|
/* harmony export */ MobileViTV2Model: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MobileViTV2Model),
|
|
34320
34995
|
/* harmony export */ MobileViTV2PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MobileViTV2PreTrainedModel),
|
|
34321
34996
|
/* harmony export */ ModelOutput: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ModelOutput),
|
|
34997
|
+
/* harmony export */ ModernBertForMaskedLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ModernBertForMaskedLM),
|
|
34998
|
+
/* harmony export */ ModernBertForSequenceClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ModernBertForSequenceClassification),
|
|
34999
|
+
/* harmony export */ ModernBertForTokenClassification: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ModernBertForTokenClassification),
|
|
35000
|
+
/* harmony export */ ModernBertModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ModernBertModel),
|
|
35001
|
+
/* harmony export */ ModernBertPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.ModernBertPreTrainedModel),
|
|
34322
35002
|
/* harmony export */ Moondream1ForConditionalGeneration: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Moondream1ForConditionalGeneration),
|
|
35003
|
+
/* harmony export */ MoonshineFeatureExtractor: () => (/* reexport safe */ _models_feature_extractors_js__WEBPACK_IMPORTED_MODULE_10__.MoonshineFeatureExtractor),
|
|
35004
|
+
/* harmony export */ MoonshineForConditionalGeneration: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MoonshineForConditionalGeneration),
|
|
35005
|
+
/* harmony export */ MoonshineModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MoonshineModel),
|
|
35006
|
+
/* harmony export */ MoonshinePreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MoonshinePreTrainedModel),
|
|
35007
|
+
/* harmony export */ MoonshineProcessor: () => (/* reexport safe */ _models_processors_js__WEBPACK_IMPORTED_MODULE_16__.MoonshineProcessor),
|
|
34323
35008
|
/* harmony export */ MptForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MptForCausalLM),
|
|
34324
35009
|
/* harmony export */ MptModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MptModel),
|
|
34325
35010
|
/* harmony export */ MptPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.MptPreTrainedModel),
|
|
@@ -34371,6 +35056,10 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
34371
35056
|
/* harmony export */ Phi3ForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Phi3ForCausalLM),
|
|
34372
35057
|
/* harmony export */ Phi3Model: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Phi3Model),
|
|
34373
35058
|
/* harmony export */ Phi3PreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Phi3PreTrainedModel),
|
|
35059
|
+
/* harmony export */ Phi3VForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Phi3VForCausalLM),
|
|
35060
|
+
/* harmony export */ Phi3VImageProcessor: () => (/* reexport safe */ _models_image_processors_js__WEBPACK_IMPORTED_MODULE_13__.Phi3VImageProcessor),
|
|
35061
|
+
/* harmony export */ Phi3VPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.Phi3VPreTrainedModel),
|
|
35062
|
+
/* harmony export */ Phi3VProcessor: () => (/* reexport safe */ _models_processors_js__WEBPACK_IMPORTED_MODULE_16__.Phi3VProcessor),
|
|
34374
35063
|
/* harmony export */ PhiForCausalLM: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PhiForCausalLM),
|
|
34375
35064
|
/* harmony export */ PhiModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PhiModel),
|
|
34376
35065
|
/* harmony export */ PhiPreTrainedModel: () => (/* reexport safe */ _models_js__WEBPACK_IMPORTED_MODULE_2__.PhiPreTrainedModel),
|
|
@@ -34619,9 +35308,11 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
34619
35308
|
/* harmony export */ permute_data: () => (/* reexport safe */ _utils_maths_js__WEBPACK_IMPORTED_MODULE_8__.permute_data),
|
|
34620
35309
|
/* harmony export */ pipeline: () => (/* reexport safe */ _pipelines_js__WEBPACK_IMPORTED_MODULE_1__.pipeline),
|
|
34621
35310
|
/* harmony export */ quantize_embeddings: () => (/* reexport safe */ _utils_tensor_js__WEBPACK_IMPORTED_MODULE_7__.quantize_embeddings),
|
|
35311
|
+
/* harmony export */ rand: () => (/* reexport safe */ _utils_tensor_js__WEBPACK_IMPORTED_MODULE_7__.rand),
|
|
34622
35312
|
/* harmony export */ read_audio: () => (/* reexport safe */ _utils_audio_js__WEBPACK_IMPORTED_MODULE_5__.read_audio),
|
|
34623
35313
|
/* harmony export */ rfft: () => (/* reexport safe */ _utils_tensor_js__WEBPACK_IMPORTED_MODULE_7__.rfft),
|
|
34624
35314
|
/* harmony export */ round: () => (/* reexport safe */ _utils_maths_js__WEBPACK_IMPORTED_MODULE_8__.round),
|
|
35315
|
+
/* harmony export */ slice: () => (/* reexport safe */ _utils_tensor_js__WEBPACK_IMPORTED_MODULE_7__.slice),
|
|
34625
35316
|
/* harmony export */ softmax: () => (/* reexport safe */ _utils_maths_js__WEBPACK_IMPORTED_MODULE_8__.softmax),
|
|
34626
35317
|
/* harmony export */ spectrogram: () => (/* reexport safe */ _utils_audio_js__WEBPACK_IMPORTED_MODULE_5__.spectrogram),
|
|
34627
35318
|
/* harmony export */ stack: () => (/* reexport safe */ _utils_tensor_js__WEBPACK_IMPORTED_MODULE_7__.stack),
|