@huggingface/transformers 3.1.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/transformers.cjs +777 -143
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +787 -143
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +787 -143
- package/dist/transformers.mjs.map +1 -1
- package/package.json +1 -1
- package/src/base/image_processors_utils.js +3 -1
- package/src/configs.js +10 -2
- package/src/env.js +1 -1
- package/src/models/feature_extractors.js +1 -0
- package/src/models/idefics3/image_processing_idefics3.js +24 -13
- package/src/models/image_processors.js +1 -0
- package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
- package/src/models/moonshine/processing_moonshine.js +20 -0
- package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
- package/src/models/phi3_v/processing_phi3_v.js +53 -0
- package/src/models/processors.js +2 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
- package/src/models/pyannote/processing_pyannote.js +7 -54
- package/src/models.js +176 -30
- package/src/ops/registry.js +11 -0
- package/src/pipelines.js +30 -0
- package/src/utils/tensor.js +51 -1
- package/types/base/image_processors_utils.d.ts +2 -2
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/feature_extractors.d.ts +1 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
- package/types/models/moonshine/processing_moonshine.d.ts +17 -0
- package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +4 -15
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models.d.ts +33 -1
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +5 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +16 -0
- package/types/utils/tensor.d.ts.map +1 -1
package/src/models.js
CHANGED
|
@@ -131,6 +131,7 @@ const MODEL_TYPES = {
|
|
|
131
131
|
ImageTextToText: 6,
|
|
132
132
|
Musicgen: 7,
|
|
133
133
|
MultiModality: 8,
|
|
134
|
+
Phi3V: 9,
|
|
134
135
|
}
|
|
135
136
|
//////////////////////////////////////////////////
|
|
136
137
|
|
|
@@ -906,6 +907,10 @@ export class PreTrainedModel extends Callable {
|
|
|
906
907
|
this._forward = imageTextToTextForward;
|
|
907
908
|
this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
|
|
908
909
|
break;
|
|
910
|
+
case MODEL_TYPES.Phi3V:
|
|
911
|
+
this.can_generate = true;
|
|
912
|
+
this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
|
|
913
|
+
break;
|
|
909
914
|
|
|
910
915
|
case MODEL_TYPES.MultiModality:
|
|
911
916
|
this.can_generate = true;
|
|
@@ -1070,6 +1075,18 @@ export class PreTrainedModel extends Callable {
|
|
|
1070
1075
|
}, options),
|
|
1071
1076
|
]);
|
|
1072
1077
|
|
|
1078
|
+
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
1079
|
+
info = await Promise.all([
|
|
1080
|
+
constructSessions(pretrained_model_name_or_path, {
|
|
1081
|
+
prepare_inputs_embeds: 'prepare_inputs_embeds',
|
|
1082
|
+
model: 'model',
|
|
1083
|
+
vision_encoder: 'vision_encoder',
|
|
1084
|
+
}, options),
|
|
1085
|
+
getOptionalConfigs(pretrained_model_name_or_path, {
|
|
1086
|
+
generation_config: 'generation_config.json',
|
|
1087
|
+
}, options),
|
|
1088
|
+
]);
|
|
1089
|
+
|
|
1073
1090
|
} else { // should be MODEL_TYPES.EncoderOnly
|
|
1074
1091
|
if (modelType !== MODEL_TYPES.EncoderOnly) {
|
|
1075
1092
|
const type = modelName ?? config?.model_type;
|
|
@@ -3342,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3342
3359
|
}
|
|
3343
3360
|
//////////////////////////////////////////////////
|
|
3344
3361
|
|
|
3362
|
+
|
|
3363
|
+
//////////////////////////////////////////////////
|
|
3364
|
+
// Moonshine models
|
|
3365
|
+
/**
 * Common base class for the Moonshine model classes.
 *
 * It only sets three instance fields; how they are consumed is decided by
 * `PreTrainedModel` (defined outside this view).
 */
export class MoonshinePreTrainedModel extends PreTrainedModel {
    // No attention mask is requested for this architecture.
    requires_attention_mask = false;

    // Name of the primary (audio) input.
    main_input_name = 'input_values';

    // Input names listed for the forward pass.
    forward_params = ['input_values', 'decoder_input_ids', 'past_key_values'];
}
|
|
3375
|
+
|
|
3376
|
+
/**
 * Bare Moonshine model without a language model head.
 * Adds nothing on top of `MoonshinePreTrainedModel`.
 */
export class MoonshineModel extends MoonshinePreTrainedModel {}

/**
 * Moonshine model class registered under the `'moonshine'` key of the
 * speech sequence-to-sequence mapping in this file.
 * Adds nothing on top of `MoonshinePreTrainedModel`.
 */
export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel {}
|
|
3382
|
+
//////////////////////////////////////////////////
|
|
3383
|
+
|
|
3384
|
+
|
|
3345
3385
|
//////////////////////////////////////////////////
|
|
3346
3386
|
/**
|
|
3347
3387
|
* Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
|
|
@@ -3612,6 +3652,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
|
3612
3652
|
}
|
|
3613
3653
|
//////////////////////////////////////////////////
|
|
3614
3654
|
|
|
3655
|
+
/**
 * Common base class for Phi-3 Vision models.
 * Only declares the list of input names used for the forward pass.
 */
export class Phi3VPreTrainedModel extends PreTrainedModel {
    forward_params = [
        // Text-side inputs
        'input_ids', 'inputs_embeds', 'attention_mask', 'position_ids',
        // Image-side inputs
        'pixel_values', 'image_sizes',
        // Decoder cache
        'past_key_values',
    ];
}
|
|
3666
|
+
/**
 * Phi-3 Vision model for causal language modeling over text and, optionally, images.
 */
export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
    /**
     * Runs one forward pass.
     *
     * When `inputs_embeds` is not supplied, it is produced by the
     * `prepare_inputs_embeds` session from `input_ids` and image features.
     * The image features come from the `vision_encoder` session, or are an
     * empty `[0, hidden_size]` float32 tensor when no image is encoded.
     * The embeddings are then handed to `decoderForward`.
     *
     * @param {Object} model_inputs The inputs; every named field defaults to `null`.
     * @returns {Promise<Object>} Whatever `decoderForward` resolves to.
     * @throws {Error} If an image is to be encoded but `image_sizes` is missing.
     */
    async forward({
        // Produced by the tokenizer/processor:
        input_ids = null,
        attention_mask = null,
        pixel_values = null,
        image_sizes = null,

        // Used during generation:
        position_ids = null,
        inputs_embeds = null,
        past_key_values = null,

        // Generic generation parameters
        generation_config = null,
        logits_processor = null,

        // TODO: needed?
        ...kwargs
    }) {
        let embeds = inputs_embeds;

        if (!embeds) {
            // The image is only encoded when the second dimension of `input_ids`
            // is not 1 (presumably: not a single-token decoding step — confirm).
            const encode_image = pixel_values && input_ids.dims[1] !== 1;

            if (encode_image && !image_sizes) {
                throw new Error('`image_sizes` must be provided when `pixel_values` is provided.');
            }

            const image_features = encode_image
                ? (await sessionRun(this.sessions['vision_encoder'], { pixel_values, image_sizes })).image_features
                : new Tensor('float32', [], [0, this.config.normalized_config.hidden_size]);

            ({ inputs_embeds: embeds } = await sessionRun(
                this.sessions['prepare_inputs_embeds'],
                { input_ids, image_features },
            ));
        }

        return await decoderForward(this, {
            inputs_embeds: embeds,
            past_key_values,
            attention_mask,
            position_ids,
            generation_config,
            logits_processor,
        }, false);
    }
}
|
|
3725
|
+
|
|
3615
3726
|
//////////////////////////////////////////////////
|
|
3616
3727
|
export class CLIPPreTrainedModel extends PreTrainedModel { }
|
|
3617
3728
|
|
|
@@ -3666,9 +3777,11 @@ export class CLIPModel extends CLIPPreTrainedModel { }
|
|
|
3666
3777
|
export class CLIPTextModel extends CLIPPreTrainedModel {
|
|
3667
3778
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3668
3779
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3669
|
-
|
|
3670
|
-
|
|
3671
|
-
|
|
3780
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3781
|
+
// Update default model file name if not provided
|
|
3782
|
+
model_file_name: 'text_model',
|
|
3783
|
+
...options,
|
|
3784
|
+
});
|
|
3672
3785
|
}
|
|
3673
3786
|
}
|
|
3674
3787
|
|
|
@@ -3701,9 +3814,11 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
|
|
|
3701
3814
|
export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
3702
3815
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3703
3816
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3704
|
-
|
|
3705
|
-
|
|
3706
|
-
|
|
3817
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3818
|
+
// Update default model file name if not provided
|
|
3819
|
+
model_file_name: 'text_model',
|
|
3820
|
+
...options,
|
|
3821
|
+
});
|
|
3707
3822
|
}
|
|
3708
3823
|
}
|
|
3709
3824
|
|
|
@@ -3713,9 +3828,11 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
|
3713
3828
|
export class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
3714
3829
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3715
3830
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3716
|
-
|
|
3717
|
-
|
|
3718
|
-
|
|
3831
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3832
|
+
// Update default model file name if not provided
|
|
3833
|
+
model_file_name: 'vision_model',
|
|
3834
|
+
...options,
|
|
3835
|
+
});
|
|
3719
3836
|
}
|
|
3720
3837
|
}
|
|
3721
3838
|
|
|
@@ -3748,9 +3865,11 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
|
3748
3865
|
export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
|
|
3749
3866
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3750
3867
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3751
|
-
|
|
3752
|
-
|
|
3753
|
-
|
|
3868
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3869
|
+
// Update default model file name if not provided
|
|
3870
|
+
model_file_name: 'vision_model',
|
|
3871
|
+
...options,
|
|
3872
|
+
});
|
|
3754
3873
|
}
|
|
3755
3874
|
}
|
|
3756
3875
|
//////////////////////////////////////////////////
|
|
@@ -3834,9 +3953,11 @@ export class SiglipModel extends SiglipPreTrainedModel { }
|
|
|
3834
3953
|
export class SiglipTextModel extends SiglipPreTrainedModel {
|
|
3835
3954
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3836
3955
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3837
|
-
|
|
3838
|
-
|
|
3839
|
-
|
|
3956
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3957
|
+
// Update default model file name if not provided
|
|
3958
|
+
model_file_name: 'text_model',
|
|
3959
|
+
...options,
|
|
3960
|
+
});
|
|
3840
3961
|
}
|
|
3841
3962
|
}
|
|
3842
3963
|
|
|
@@ -3869,9 +3990,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
|
|
|
3869
3990
|
export class SiglipVisionModel extends CLIPPreTrainedModel {
|
|
3870
3991
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3871
3992
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
|
|
3993
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3994
|
+
// Update default model file name if not provided
|
|
3995
|
+
model_file_name: 'vision_model',
|
|
3996
|
+
...options,
|
|
3997
|
+
});
|
|
3875
3998
|
}
|
|
3876
3999
|
}
|
|
3877
4000
|
//////////////////////////////////////////////////
|
|
@@ -3926,18 +4049,22 @@ export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
|
|
|
3926
4049
|
export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
|
|
3927
4050
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3928
4051
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3929
|
-
|
|
3930
|
-
|
|
3931
|
-
|
|
4052
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
4053
|
+
// Update default model file name if not provided
|
|
4054
|
+
model_file_name: 'text_model',
|
|
4055
|
+
...options,
|
|
4056
|
+
});
|
|
3932
4057
|
}
|
|
3933
4058
|
}
|
|
3934
4059
|
|
|
3935
4060
|
export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
|
|
3936
4061
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3937
4062
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3938
|
-
|
|
3939
|
-
|
|
3940
|
-
|
|
4063
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
4064
|
+
// Update default model file name if not provided
|
|
4065
|
+
model_file_name: 'vision_model',
|
|
4066
|
+
...options,
|
|
4067
|
+
});
|
|
3941
4068
|
}
|
|
3942
4069
|
}
|
|
3943
4070
|
//////////////////////////////////////////////////
|
|
@@ -4097,6 +4224,14 @@ export class LlamaForCausalLM extends LlamaPreTrainedModel { }
|
|
|
4097
4224
|
//////////////////////////////////////////////////
|
|
4098
4225
|
|
|
4099
4226
|
|
|
4227
|
+
//////////////////////////////////////////////////
|
|
4228
|
+
// EXAONE models
|
|
4229
|
+
/** Common base class for EXAONE models; adds nothing on top of `PreTrainedModel`. */
export class ExaonePreTrainedModel extends PreTrainedModel {}

/** EXAONE model class registered under `'exaone'` in the decoder-only model mapping of this file. */
export class ExaoneModel extends ExaonePreTrainedModel {}

/** EXAONE model class registered under `'exaone'` in the causal-LM model mapping of this file. */
export class ExaoneForCausalLM extends ExaonePreTrainedModel {}
|
|
4232
|
+
//////////////////////////////////////////////////
|
|
4233
|
+
|
|
4234
|
+
|
|
4100
4235
|
//////////////////////////////////////////////////
|
|
4101
4236
|
// MobileLLM models
|
|
4102
4237
|
export class MobileLLMPreTrainedModel extends PreTrainedModel { }
|
|
@@ -6159,9 +6294,11 @@ export class ClapModel extends ClapPreTrainedModel { }
|
|
|
6159
6294
|
export class ClapTextModelWithProjection extends ClapPreTrainedModel {
|
|
6160
6295
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
6161
6296
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
6162
|
-
|
|
6163
|
-
|
|
6164
|
-
|
|
6297
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
6298
|
+
// Update default model file name if not provided
|
|
6299
|
+
model_file_name: 'text_model',
|
|
6300
|
+
...options,
|
|
6301
|
+
});
|
|
6165
6302
|
}
|
|
6166
6303
|
}
|
|
6167
6304
|
|
|
@@ -6194,9 +6331,11 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
|
|
|
6194
6331
|
export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
|
|
6195
6332
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
6196
6333
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
6197
|
-
|
|
6198
|
-
|
|
6199
|
-
|
|
6334
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
6335
|
+
// Update default model file name if not provided
|
|
6336
|
+
model_file_name: 'audio_model',
|
|
6337
|
+
...options,
|
|
6338
|
+
});
|
|
6200
6339
|
}
|
|
6201
6340
|
}
|
|
6202
6341
|
//////////////////////////////////////////////////
|
|
@@ -6883,6 +7022,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
6883
7022
|
['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]],
|
|
6884
7023
|
['codegen', ['CodeGenModel', CodeGenModel]],
|
|
6885
7024
|
['llama', ['LlamaModel', LlamaModel]],
|
|
7025
|
+
['exaone', ['ExaoneModel', ExaoneModel]],
|
|
6886
7026
|
['olmo', ['OlmoModel', OlmoModel]],
|
|
6887
7027
|
['olmo2', ['Olmo2Model', Olmo2Model]],
|
|
6888
7028
|
['mobilellm', ['MobileLLMModel', MobileLLMModel]],
|
|
@@ -6905,6 +7045,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
6905
7045
|
const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
|
|
6906
7046
|
['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
|
|
6907
7047
|
['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
|
|
7048
|
+
['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
|
|
6908
7049
|
]);
|
|
6909
7050
|
|
|
6910
7051
|
const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
|
|
@@ -6975,6 +7116,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
6975
7116
|
['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
|
|
6976
7117
|
['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
|
|
6977
7118
|
['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
|
|
7119
|
+
['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]],
|
|
6978
7120
|
['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
|
|
6979
7121
|
['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
|
|
6980
7122
|
['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
|
|
@@ -6994,6 +7136,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
6994
7136
|
['falcon', ['FalconForCausalLM', FalconForCausalLM]],
|
|
6995
7137
|
['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]],
|
|
6996
7138
|
['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]],
|
|
7139
|
+
|
|
7140
|
+
// Also image-text-to-text
|
|
7141
|
+
['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]],
|
|
6997
7142
|
]);
|
|
6998
7143
|
|
|
6999
7144
|
const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
|
|
@@ -7231,6 +7376,7 @@ const CUSTOM_MAPPING = [
|
|
|
7231
7376
|
// OVERRIDE:
|
|
7232
7377
|
// TODO: Refactor to allow class to specify model
|
|
7233
7378
|
['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen],
|
|
7379
|
+
['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V],
|
|
7234
7380
|
|
|
7235
7381
|
['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
|
|
7236
7382
|
['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
|
package/src/ops/registry.js
CHANGED
|
@@ -100,4 +100,15 @@ export class TensorOpRegistry {
|
|
|
100
100
|
}
|
|
101
101
|
return this._top_k;
|
|
102
102
|
}
|
|
103
|
+
|
|
104
|
+
static get slice() {
|
|
105
|
+
if (!this._slice) {
|
|
106
|
+
this._slice = wrap(
|
|
107
|
+
[8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13],
|
|
108
|
+
this.session_options,
|
|
109
|
+
'y',
|
|
110
|
+
)
|
|
111
|
+
}
|
|
112
|
+
return this._slice;
|
|
113
|
+
}
|
|
103
114
|
}
|
package/src/pipelines.js
CHANGED
|
@@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1729
1729
|
case 'unispeech-sat':
|
|
1730
1730
|
case 'hubert':
|
|
1731
1731
|
return this._call_wav2vec2(audio, kwargs)
|
|
1732
|
+
case 'moonshine':
|
|
1733
|
+
return this._call_moonshine(audio, kwargs)
|
|
1732
1734
|
default:
|
|
1733
1735
|
throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
|
|
1734
1736
|
}
|
|
@@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1882
1884
|
}
|
|
1883
1885
|
return single ? toReturn[0] : toReturn;
|
|
1884
1886
|
}
|
|
1887
|
+
|
|
1888
|
+
/**
|
|
1889
|
+
* @type {AutomaticSpeechRecognitionPipelineCallback}
|
|
1890
|
+
* @private
|
|
1891
|
+
*/
|
|
1892
|
+
async _call_moonshine(audio, kwargs) {
|
|
1893
|
+
const single = !Array.isArray(audio);
|
|
1894
|
+
if (single) {
|
|
1895
|
+
audio = [/** @type {AudioInput} */ (audio)];
|
|
1896
|
+
}
|
|
1897
|
+
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
1898
|
+
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
1899
|
+
const toReturn = [];
|
|
1900
|
+
for (const aud of preparedAudios) {
|
|
1901
|
+
const inputs = await this.processor(aud);
|
|
1902
|
+
|
|
1903
|
+
// According to the [paper](https://arxiv.org/pdf/2410.15608):
|
|
1904
|
+
// "We use greedy decoding, with a heuristic limit of 6 output tokens
|
|
1905
|
+
// per second of audio to avoid repeated output sequences."
|
|
1906
|
+
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
1907
|
+
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
1908
|
+
|
|
1909
|
+
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
1910
|
+
toReturn.push({ text });
|
|
1911
|
+
}
|
|
1912
|
+
return single ? toReturn[0] : toReturn;
|
|
1913
|
+
}
|
|
1914
|
+
|
|
1885
1915
|
}
|
|
1886
1916
|
|
|
1887
1917
|
/**
|
package/src/utils/tensor.js
CHANGED
|
@@ -772,8 +772,21 @@ export class Tensor {
|
|
|
772
772
|
if (!DataTypeMap.hasOwnProperty(type)) {
|
|
773
773
|
throw new Error(`Unsupported type: ${type}`);
|
|
774
774
|
}
|
|
775
|
+
|
|
776
|
+
// Handle special cases where a mapping function is needed (e.g., where one type is a bigint and the other is a number)
|
|
777
|
+
let map_fn;
|
|
778
|
+
const is_source_bigint = ['int64', 'uint64'].includes(this.type);
|
|
779
|
+
const is_dest_bigint = ['int64', 'uint64'].includes(type);
|
|
780
|
+
if (is_source_bigint && !is_dest_bigint) {
|
|
781
|
+
// TypeError: Cannot convert a BigInt value to a number
|
|
782
|
+
map_fn = Number;
|
|
783
|
+
} else if (!is_source_bigint && is_dest_bigint) {
|
|
784
|
+
// TypeError: Cannot convert [x] to a BigInt
|
|
785
|
+
map_fn = BigInt;
|
|
786
|
+
}
|
|
787
|
+
|
|
775
788
|
// @ts-ignore
|
|
776
|
-
return new Tensor(type, DataTypeMap[type].from(this.data), this.dims);
|
|
789
|
+
return new Tensor(type, DataTypeMap[type].from(this.data, map_fn), this.dims);
|
|
777
790
|
}
|
|
778
791
|
}
|
|
779
792
|
|
|
@@ -971,6 +984,29 @@ export async function topk(x, k) {
|
|
|
971
984
|
});
|
|
972
985
|
}
|
|
973
986
|
|
|
987
|
+
|
|
988
|
+
/**
 * Wraps a plain array of indices in a 1-D `int64` tensor of the same length.
 * @param {number[]} array The index values.
 * @returns {Tensor} The index tensor.
 */
const arrayToIndexTensor = (array) => {
    const dims = [array.length];
    return new Tensor('int64', array, dims);
};

/**
 * Slice a multidimensional float32 tensor.
 * @param {Tensor} data Tensor of data to extract slices from.
 * @param {number[]} starts 1-D array of starting indices of corresponding axis in `axes`.
 * @param {number[]} ends 1-D array of ending indices (exclusive) of corresponding axis in `axes`.
 * @param {number[]} axes 1-D array of axes that `starts` and `ends` apply to.
 * @param {number[]} [steps] 1-D array of slice step of corresponding axis in `axes`.
 * Defaults to a step of 1 for every entry of `axes`.
 * @returns {Promise<Tensor>} Sliced data tensor.
 */
export async function slice(data, starts, ends, axes, steps) {
    // Resolve the operator first, then build its inputs.
    const run_slice = await TensorOpRegistry.slice;

    const feeds = {
        x: data,
        s: arrayToIndexTensor(starts),
        e: arrayToIndexTensor(ends),
        a: arrayToIndexTensor(axes),
        t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
    };
    return await run_slice(feeds);
}
|
|
1008
|
+
|
|
1009
|
+
|
|
974
1010
|
/**
|
|
975
1011
|
* Perform mean pooling of the last hidden state followed by a normalization step.
|
|
976
1012
|
* @param {Tensor} last_hidden_state Tensor of shape [batchSize, seqLength, embedDim]
|
|
@@ -1417,6 +1453,20 @@ export function zeros_like(tensor) {
|
|
|
1417
1453
|
return zeros(tensor.dims);
|
|
1418
1454
|
}
|
|
1419
1455
|
|
|
1456
|
+
/**
 * Creates a float32 tensor of the given shape whose elements are drawn
 * independently with `Math.random()` (uniform on [0, 1)).
 *
 * NOTE(review): values are stored as float32, so a draw extremely close to 1
 * may round up to exactly 1.0.
 *
 * @param {number[]} size A sequence of integers defining the shape of the output tensor.
 * @returns {Tensor} The random tensor.
 */
export function rand(size) {
    // Total element count is the product of all dimensions (1 for an empty shape).
    const numel = size.reduce((acc, dim) => acc * dim, 1);
    const samples = Float32Array.from({ length: numel }, Math.random);
    return new Tensor("float32", samples, size);
}
|
|
1469
|
+
|
|
1420
1470
|
/**
|
|
1421
1471
|
* Quantizes the embeddings tensor to binary or unsigned binary precision.
|
|
1422
1472
|
* @param {Tensor} tensor The tensor to quantize.
|
|
@@ -149,7 +149,7 @@ export class ImageProcessor extends ImageProcessor_base {
|
|
|
149
149
|
* Pad the image by a certain amount.
|
|
150
150
|
* @param {Float32Array} pixelData The pixel data to pad.
|
|
151
151
|
* @param {number[]} imgDims The dimensions of the image (height, width, channels).
|
|
152
|
-
* @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
|
|
152
|
+
* @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
|
|
153
153
|
* @param {Object} options The options for padding.
|
|
154
154
|
* @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
|
|
155
155
|
* @param {boolean} [options.center=false] Whether to center the image.
|
|
@@ -159,7 +159,7 @@ export class ImageProcessor extends ImageProcessor_base {
|
|
|
159
159
|
pad_image(pixelData: Float32Array, imgDims: number[], padSize: {
|
|
160
160
|
width: number;
|
|
161
161
|
height: number;
|
|
162
|
-
} | number, { mode, center, constant_values, }?: {
|
|
162
|
+
} | number | "square", { mode, center, constant_values, }?: {
|
|
163
163
|
mode?: "constant" | "symmetric";
|
|
164
164
|
center?: boolean;
|
|
165
165
|
constant_values?: number | number[];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"image_processors_utils.d.ts","sourceRoot":"","sources":["../../src/base/image_processors_utils.js"],"names":[],"mappings":"AA+EA;;;;;;;;;GASG;AACH,uDAPG;IAAwB,MAAM,EAAtB,MAAM;IACU,UAAU,EAA1B,MAAM;CACd,cAAQ,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,iBAClB,OAAO,GACN,KAAQ,CAwEnB;AAGD;;;;;;GAMG;AACH,4DALW,GAAC,iBACD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB;IAAC,YAAY,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAA;CAAC,EAAE,CAwDtD;AAkPD;;;;;;;;;GASG;AACH,4DARW,GAAC,cACD,MAAM,mBACN,MAAM,gCACN,MAAM,sBACN,GAAG,CAAC,MAAM,CAAC,iBACX,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAChB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAuE/G;AAGD;;;;;;;GAOG;AACH,4DANW,GAAC,cACD,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAI/G;;KA3iBsC,GAAG;UAAyB,GACnE;;AA6iBA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH;
|
|
1
|
+
{"version":3,"file":"image_processors_utils.d.ts","sourceRoot":"","sources":["../../src/base/image_processors_utils.js"],"names":[],"mappings":"AA+EA;;;;;;;;;GASG;AACH,uDAPG;IAAwB,MAAM,EAAtB,MAAM;IACU,UAAU,EAA1B,MAAM;CACd,cAAQ,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,iBAClB,OAAO,GACN,KAAQ,CAwEnB;AAGD;;;;;;GAMG;AACH,4DALW,GAAC,iBACD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB;IAAC,YAAY,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAA;CAAC,EAAE,CAwDtD;AAkPD;;;;;;;;;GASG;AACH,4DARW,GAAC,cACD,MAAM,mBACN,MAAM,gCACN,MAAM,sBACN,GAAG,CAAC,MAAM,CAAC,iBACX,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAChB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAuE/G;AAGD;;;;;;;GAOG;AACH,4DANW,GAAC,cACD,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAI/G;;KA3iBsC,GAAG;UAAyB,GACnE;;AA6iBA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH;IAqeI;;;;;;;;;;;;;;OAcG;IACH,sDATW,MAAM,WAKN,OAAO,iBAAiB,EAAE,iBAAiB,GAEzC,OAAO,CAAC,cAAc,CAAC,CAKnC;IArfD;;;OAGG;IACH,oBAFW,oBAAoB,EAmC9B;IA9BG,qBAAkD;IAClD,oBAA+C;IAE/C,iBAAoC;IACpC,oBAA2C;IAC3C,uBAAwD;IACxD,sBAAuC;IAEvC,sBAAuC;IACvC,UAA4C;IAC5C,mBAA8D;IAC9D,uBAAwE;IAExE,wBAA2C;IAC3C,eAAiC;IACjC,oBAAmD;IACnD,oBAA2C;IAE3C,cAA+B;IAC/B,YAA2B;IAQ3B,+BAAkE;IAElE,6BAAoB;IAGxB;;;;;;;OAOG;IACH,iBALW,QAAQ,QACR;QAAC,MAAM,EAAC,MAAM,CAAC;QAAC,KAAK,EAAC,MAAM,CAAA;KAAC,aAC7B,MAAM,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAC5B,OAAO,CAAC,QAAQ,CAAC,CAsB7B;IAGD;;;;;OAKG;IACH,mBAJW,QAAQ,mBACR,MAAM,GACJ,OAAO,CAAC,QAAQ,CAAC,CAiC7B;IAED;;;;;;;;;;OAUG;IACH,qBATW,YAAY,WACZ,MAAM,EAAE,WACR;QAAC,KAAK,EAAC,MAAM,CAAC;QAAC,MAAM,EAAC,MAAM,CAAA;KAAC,GAAC,MAAM,GAAC,QAAQ,uCAErD;QAAyC,IAAI,GAArC,UAAU,GAAC,WAAW;QACJ,MAAM,GAAxB,OAAO;QACmB,eAAe,GAAzC,MAAM,GAAC,MAAM,EAAE;KACvB,GAAU,CAAC,YAAY,EAAE,MAAM,EAAE,CAAC,CA+EpC;IAED;;;;OAIG;IACH,mBAHW,
YAAY,GACV,IAAI,CAMhB;IAED;;;;;;OAMG;IACH,oCAJW,QAAQ,QACR,GAAG,GACD,CAAC,MAAM,EAAE,MAAM,CAAC,CA4F5B;IAED;;;;OAIG;IACH,cAHW,QAAQ,GACN,OAAO,CAAC,QAAQ,CAAC,CAO7B;IAED;;;;;OAKG;IAEH;;;;;;OAMG;IACH,kBAJW,QAAQ,iGAEN,OAAO;;;;uBAVN,WAAW;;;;6BACX,WAAW;;;;sBACX,MAAM;MAQmB,CAsHtC;IAED;;;;;;;OAOG;IACH,cAJW,QAAQ,EAAE,WACP,GAAG,EAAA,GACJ,OAAO,CAAC,oBAAoB,CAAC,CAqBzC;CAsBJ;;;;;0BAvjCY,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC;;;;;kBAM9B,MAAM;;;;oBACN,WAAW,EAAE;;;;0BACb,WAAW,EAAE;;;;;;;;;;;;;iBAgiBb,MAAM,EAAE;;;;gBACR,MAAM,EAAE;;;;iBACR,OAAO;;;;qBACP,MAAM;;;;mBACN,OAAO;;;;gBACP,OAAO;;;;eACP,MAAM;;;;WACN,MAAM,MAAO;;;;iBACb,MAAM,MAAO;;;;;4BACb,OAAO;;;;;qBAEP,OAAO;;;;mBAEP,OAAO;;;;;wBACP,OAAO;;;;;yBAEP,MAAM;;;;WAGN,MAAM,EAAE;;;;UACR,MAAM,EAAE;;uBAtkBqB,oBAAoB;yBAEtC,mBAAmB"}
|
package/types/configs.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AA2OA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCA7VY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBA0VrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"image_processing_auto.d.ts","sourceRoot":"","sources":["../../../src/models/auto/image_processing_auto.js"],"names":[],"mappings":"AAMA;
|
|
1
|
+
{"version":3,"file":"image_processing_auto.d.ts","sourceRoot":"","sources":["../../../src/models/auto/image_processing_auto.js"],"names":[],"mappings":"AAMA;kFAuB4jyC,oBAAiB;CAD5kyC;+BAzB8B,sCAAsC"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export * from "./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js";
|
|
2
2
|
export * from "./clap/feature_extraction_clap.js";
|
|
3
|
+
export * from "./moonshine/feature_extraction_moonshine.js";
|
|
3
4
|
export * from "./pyannote/feature_extraction_pyannote.js";
|
|
4
5
|
export * from "./seamless_m4t/feature_extraction_seamless_m4t.js";
|
|
5
6
|
export * from "./speecht5/feature_extraction_speecht5.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"image_processing_idefics3.d.ts","sourceRoot":"","sources":["../../../src/models/idefics3/image_processing_idefics3.js"],"names":[],"mappings":"AAOA;IACI,yBAKC;IAFG,wBAA2D;IAC3D,oBAA2C;IAG/C;;;OAGG;IAEH;;;;;OAKG;IACH,6GAHW,MAAM;;;MAiBhB;IAED,uDAAuD;IACvD,cADY,0CAAS,yCAAU,GAAC,yCAAU,EAAE;;;;;;;;;;OA4H3C;IAED;;;;;;
|
|
1
|
+
{"version":3,"file":"image_processing_idefics3.d.ts","sourceRoot":"","sources":["../../../src/models/idefics3/image_processing_idefics3.js"],"names":[],"mappings":"AAOA;IACI,yBAKC;IAFG,wBAA2D;IAC3D,oBAA2C;IAG/C;;;OAGG;IAEH;;;;;OAKG;IACH,6GAHW,MAAM;;;MAiBhB;IAED,uDAAuD;IACvD,cADY,0CAAS,yCAAU,GAAC,yCAAU,EAAE;;;;;;;;;;OA4H3C;IAED;;;;;;OA4DC;CACJ;+BAjOM,sCAAsC"}
|
|
@@ -23,6 +23,7 @@ export * from "./mobilevit/image_processing_mobilevit.js";
|
|
|
23
23
|
export * from "./nougat/image_processing_nougat.js";
|
|
24
24
|
export * from "./owlv2/image_processing_owlv2.js";
|
|
25
25
|
export * from "./owlvit/image_processing_owlvit.js";
|
|
26
|
+
export * from "./phi3_v/image_processing_phi3_v.js";
|
|
26
27
|
export * from "./pvt/image_processing_pvt.js";
|
|
27
28
|
export * from "./qwen2_vl/image_processing_qwen2_vl.js";
|
|
28
29
|
export * from "./rt_detr/image_processing_rt_detr.js";
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export class MoonshineFeatureExtractor extends FeatureExtractor {
|
|
2
|
+
/**
|
|
3
|
+
* Asynchronously extracts input values from a given audio using the provided configuration.
|
|
4
|
+
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
5
|
+
* @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
|
|
6
|
+
*/
|
|
7
|
+
_call(audio: Float32Array | Float64Array): Promise<{
|
|
8
|
+
input_values: Tensor;
|
|
9
|
+
}>;
|
|
10
|
+
}
|
|
11
|
+
import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
|
|
12
|
+
import { Tensor } from '../../utils/tensor.js';
|
|
13
|
+
//# sourceMappingURL=feature_extraction_moonshine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"feature_extraction_moonshine.d.ts","sourceRoot":"","sources":["../../../src/models/moonshine/feature_extraction_moonshine.js"],"names":[],"mappings":"AAIA;IACI;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;KAAE,CAAC,CAgB9C;CACJ;iCAzBuD,wCAAwC;uBACzE,uBAAuB"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Represents a MoonshineProcessor that extracts features from an audio input.
|
|
3
|
+
*/
|
|
4
|
+
export class MoonshineProcessor extends Processor {
|
|
5
|
+
static tokenizer_class: typeof AutoTokenizer;
|
|
6
|
+
static feature_extractor_class: typeof AutoFeatureExtractor;
|
|
7
|
+
/**
|
|
8
|
+
* Calls the feature_extractor function with the given audio input.
|
|
9
|
+
* @param {any} audio The audio input to extract features from.
|
|
10
|
+
* @returns {Promise<any>} A Promise that resolves with the extracted features.
|
|
11
|
+
*/
|
|
12
|
+
_call(audio: any): Promise<any>;
|
|
13
|
+
}
|
|
14
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
15
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
16
|
+
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
|
|
17
|
+
//# sourceMappingURL=processing_moonshine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processing_moonshine.d.ts","sourceRoot":"","sources":["../../../src/models/moonshine/processing_moonshine.js"],"names":[],"mappings":"AAIA;;GAEG;AACH;IACI,6CAAsC;IACtC,4DAAqD;IAErD;;;;OAIG;IACH,aAHW,GAAG,GACD,OAAO,CAAC,GAAG,CAAC,CAIxB;CACJ;0BAjByB,gCAAgC;8BAD5B,qBAAqB;qCADd,oCAAoC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export class Phi3VImageProcessor extends ImageProcessor {
|
|
2
|
+
constructor(config: any);
|
|
3
|
+
_num_crops: any;
|
|
4
|
+
calc_num_image_tokens_from_image_size(width: any, height: any): number;
|
|
5
|
+
_call(images: any, { num_crops, }?: {
|
|
6
|
+
num_crops?: any;
|
|
7
|
+
}): Promise<{
|
|
8
|
+
pixel_values: Tensor;
|
|
9
|
+
original_sizes: any[];
|
|
10
|
+
reshaped_input_sizes: any[];
|
|
11
|
+
image_sizes: Tensor;
|
|
12
|
+
num_img_tokens: number[];
|
|
13
|
+
}>;
|
|
14
|
+
}
|
|
15
|
+
import { ImageProcessor } from "../../base/image_processors_utils.js";
|
|
16
|
+
import { Tensor } from "../../utils/tensor.js";
|
|
17
|
+
//# sourceMappingURL=image_processing_phi3_v.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image_processing_phi3_v.d.ts","sourceRoot":"","sources":["../../../src/models/phi3_v/image_processing_phi3_v.js"],"names":[],"mappings":"AASA;IACI,yBAWC;IADG,gBAAkC;IAEtC,uEAIC;IA0CD;;;;;;;;OA6FC;CACJ;+BAhKM,sCAAsC;uBACa,uBAAuB"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export class Phi3VProcessor extends Processor {
|
|
2
|
+
static image_processor_class: typeof AutoImageProcessor;
|
|
3
|
+
static tokenizer_class: typeof AutoTokenizer;
|
|
4
|
+
/**
|
|
5
|
+
*
|
|
6
|
+
* @param {string|string[]} text
|
|
7
|
+
* @param {RawImage|RawImage[]} images
|
|
8
|
+
* @param {...any} args
|
|
9
|
+
* @returns {Promise<any>}
|
|
10
|
+
*/
|
|
11
|
+
_call(text: string | string[], images?: RawImage | RawImage[], { padding, truncation, num_crops, }?: any[]): Promise<any>;
|
|
12
|
+
}
|
|
13
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
14
|
+
import { RawImage } from "../../utils/image.js";
|
|
15
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
16
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
17
|
+
//# sourceMappingURL=processing_phi3_v.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processing_phi3_v.d.ts","sourceRoot":"","sources":["../../../src/models/phi3_v/processing_phi3_v.js"],"names":[],"mappings":"AAQA;IACI,wDAAiD;IACjD,6CAAsC;IAEtC;;;;;;OAMG;IACH,YALW,MAAM,GAAC,MAAM,EAAE,WACf,QAAQ,GAAC,QAAQ,EAAE,wCACf,GAAG,EAAA,GACL,OAAO,CAAC,GAAG,CAAC,CAkCxB;CACJ;0BApDyB,gCAAgC;yBAGjC,sBAAsB;mCAFZ,kCAAkC;8BACvC,qBAAqB"}
|