@huggingface/transformers 3.1.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/README.md +6 -3
  2. package/dist/transformers.cjs +777 -143
  3. package/dist/transformers.cjs.map +1 -1
  4. package/dist/transformers.js +787 -143
  5. package/dist/transformers.js.map +1 -1
  6. package/dist/transformers.min.cjs +1 -1
  7. package/dist/transformers.min.cjs.map +1 -1
  8. package/dist/transformers.min.js +1 -1
  9. package/dist/transformers.min.js.map +1 -1
  10. package/dist/transformers.min.mjs +1 -1
  11. package/dist/transformers.min.mjs.map +1 -1
  12. package/dist/transformers.mjs +787 -143
  13. package/dist/transformers.mjs.map +1 -1
  14. package/package.json +1 -1
  15. package/src/base/image_processors_utils.js +3 -1
  16. package/src/configs.js +10 -2
  17. package/src/env.js +1 -1
  18. package/src/models/feature_extractors.js +1 -0
  19. package/src/models/idefics3/image_processing_idefics3.js +24 -13
  20. package/src/models/image_processors.js +1 -0
  21. package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
  22. package/src/models/moonshine/processing_moonshine.js +20 -0
  23. package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
  24. package/src/models/phi3_v/processing_phi3_v.js +53 -0
  25. package/src/models/processors.js +2 -0
  26. package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
  27. package/src/models/pyannote/processing_pyannote.js +7 -54
  28. package/src/models.js +176 -30
  29. package/src/ops/registry.js +11 -0
  30. package/src/pipelines.js +30 -0
  31. package/src/utils/tensor.js +51 -1
  32. package/types/base/image_processors_utils.d.ts +2 -2
  33. package/types/base/image_processors_utils.d.ts.map +1 -1
  34. package/types/configs.d.ts.map +1 -1
  35. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  36. package/types/models/feature_extractors.d.ts +1 -0
  37. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  38. package/types/models/image_processors.d.ts +1 -0
  39. package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
  40. package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
  41. package/types/models/moonshine/processing_moonshine.d.ts +17 -0
  42. package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
  43. package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
  44. package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
  45. package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
  46. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
  47. package/types/models/processors.d.ts +2 -0
  48. package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
  49. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  50. package/types/models/pyannote/processing_pyannote.d.ts +4 -15
  51. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
  52. package/types/models.d.ts +33 -1
  53. package/types/models.d.ts.map +1 -1
  54. package/types/ops/registry.d.ts +1 -0
  55. package/types/ops/registry.d.ts.map +1 -1
  56. package/types/pipelines.d.ts +5 -0
  57. package/types/pipelines.d.ts.map +1 -1
  58. package/types/utils/tensor.d.ts +16 -0
  59. package/types/utils/tensor.d.ts.map +1 -1
package/src/models.js CHANGED
@@ -131,6 +131,7 @@ const MODEL_TYPES = {
131
131
  ImageTextToText: 6,
132
132
  Musicgen: 7,
133
133
  MultiModality: 8,
134
+ Phi3V: 9,
134
135
  }
135
136
  //////////////////////////////////////////////////
136
137
 
@@ -906,6 +907,10 @@ export class PreTrainedModel extends Callable {
906
907
  this._forward = imageTextToTextForward;
907
908
  this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
908
909
  break;
910
+ case MODEL_TYPES.Phi3V:
911
+ this.can_generate = true;
912
+ this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
913
+ break;
909
914
 
910
915
  case MODEL_TYPES.MultiModality:
911
916
  this.can_generate = true;
@@ -1070,6 +1075,18 @@ export class PreTrainedModel extends Callable {
1070
1075
  }, options),
1071
1076
  ]);
1072
1077
 
1078
+ } else if (modelType === MODEL_TYPES.Phi3V) {
1079
+ info = await Promise.all([
1080
+ constructSessions(pretrained_model_name_or_path, {
1081
+ prepare_inputs_embeds: 'prepare_inputs_embeds',
1082
+ model: 'model',
1083
+ vision_encoder: 'vision_encoder',
1084
+ }, options),
1085
+ getOptionalConfigs(pretrained_model_name_or_path, {
1086
+ generation_config: 'generation_config.json',
1087
+ }, options),
1088
+ ]);
1089
+
1073
1090
  } else { // should be MODEL_TYPES.EncoderOnly
1074
1091
  if (modelType !== MODEL_TYPES.EncoderOnly) {
1075
1092
  const type = modelName ?? config?.model_type;
@@ -3342,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3342
3359
  }
3343
3360
  //////////////////////////////////////////////////
3344
3361
 
3362
+
3363
+ //////////////////////////////////////////////////
3364
+ // Moonshine models
3365
+ export class MoonshinePreTrainedModel extends PreTrainedModel {
3366
+
3367
+ requires_attention_mask = false;
3368
+ main_input_name = 'input_values';
3369
+ forward_params = [
3370
+ 'input_values',
3371
+ 'decoder_input_ids',
3372
+ 'past_key_values',
3373
+ ];
3374
+ };
3375
+
3376
+ /**
3377
+ * MoonshineModel class for training Moonshine models without a language model head.
3378
+ */
3379
+ export class MoonshineModel extends MoonshinePreTrainedModel { }
3380
+
3381
+ export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
3382
+ //////////////////////////////////////////////////
3383
+
3384
+
3345
3385
  //////////////////////////////////////////////////
3346
3386
  /**
3347
3387
  * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
@@ -3612,6 +3652,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
3612
3652
  }
3613
3653
  //////////////////////////////////////////////////
3614
3654
 
3655
+ export class Phi3VPreTrainedModel extends PreTrainedModel {
3656
+ forward_params = [
3657
+ 'input_ids',
3658
+ 'inputs_embeds',
3659
+ 'attention_mask',
3660
+ 'position_ids',
3661
+ 'pixel_values',
3662
+ 'image_sizes',
3663
+ 'past_key_values',
3664
+ ];
3665
+ }
3666
+ export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
3667
+
3668
+ async forward({
3669
+ // Produced by the tokenizer/processor:
3670
+ input_ids = null,
3671
+ attention_mask = null,
3672
+ pixel_values = null,
3673
+ image_sizes = null,
3674
+
3675
+ // Used during generation:
3676
+ position_ids = null,
3677
+ inputs_embeds = null,
3678
+ past_key_values = null,
3679
+
3680
+ // Generic generation parameters
3681
+ generation_config = null,
3682
+ logits_processor = null,
3683
+
3684
+ // TODO: needed?
3685
+ ...kwargs
3686
+ }) {
3687
+ if (!inputs_embeds) {
3688
+ let image_features;
3689
+ if (pixel_values && input_ids.dims[1] !== 1) {
3690
+ if (!image_sizes) {
3691
+ throw new Error('`image_sizes` must be provided when `pixel_values` is provided.');
3692
+ }
3693
+
3694
+ // Encode the image
3695
+ ({ image_features } = await sessionRun(this.sessions['vision_encoder'], {
3696
+ pixel_values,
3697
+ image_sizes,
3698
+ }));
3699
+ } else {
3700
+ const hidden_size = this.config.normalized_config.hidden_size;
3701
+ image_features = new Tensor(
3702
+ 'float32',
3703
+ [],
3704
+ [0, hidden_size],
3705
+ );
3706
+ }
3707
+
3708
+ ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], {
3709
+ input_ids,
3710
+ image_features,
3711
+ }));
3712
+ }
3713
+
3714
+ const outputs = await decoderForward(this, {
3715
+ inputs_embeds,
3716
+ past_key_values,
3717
+ attention_mask,
3718
+ position_ids,
3719
+ generation_config,
3720
+ logits_processor,
3721
+ }, false);
3722
+ return outputs;
3723
+ }
3724
+ }
3725
+
3615
3726
  //////////////////////////////////////////////////
3616
3727
  export class CLIPPreTrainedModel extends PreTrainedModel { }
3617
3728
 
@@ -3666,9 +3777,11 @@ export class CLIPModel extends CLIPPreTrainedModel { }
3666
3777
  export class CLIPTextModel extends CLIPPreTrainedModel {
3667
3778
  /** @type {typeof PreTrainedModel.from_pretrained} */
3668
3779
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3669
- // Update default model file name if not provided
3670
- options.model_file_name ??= 'text_model';
3671
- return super.from_pretrained(pretrained_model_name_or_path, options);
3780
+ return super.from_pretrained(pretrained_model_name_or_path, {
3781
+ // Update default model file name if not provided
3782
+ model_file_name: 'text_model',
3783
+ ...options,
3784
+ });
3672
3785
  }
3673
3786
  }
3674
3787
 
@@ -3701,9 +3814,11 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
3701
3814
  export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
3702
3815
  /** @type {typeof PreTrainedModel.from_pretrained} */
3703
3816
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3704
- // Update default model file name if not provided
3705
- options.model_file_name ??= 'text_model';
3706
- return super.from_pretrained(pretrained_model_name_or_path, options);
3817
+ return super.from_pretrained(pretrained_model_name_or_path, {
3818
+ // Update default model file name if not provided
3819
+ model_file_name: 'text_model',
3820
+ ...options,
3821
+ });
3707
3822
  }
3708
3823
  }
3709
3824
 
@@ -3713,9 +3828,11 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
3713
3828
  export class CLIPVisionModel extends CLIPPreTrainedModel {
3714
3829
  /** @type {typeof PreTrainedModel.from_pretrained} */
3715
3830
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3716
- // Update default model file name if not provided
3717
- options.model_file_name ??= 'vision_model';
3718
- return super.from_pretrained(pretrained_model_name_or_path, options);
3831
+ return super.from_pretrained(pretrained_model_name_or_path, {
3832
+ // Update default model file name if not provided
3833
+ model_file_name: 'vision_model',
3834
+ ...options,
3835
+ });
3719
3836
  }
3720
3837
  }
3721
3838
 
@@ -3748,9 +3865,11 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
3748
3865
  export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
3749
3866
  /** @type {typeof PreTrainedModel.from_pretrained} */
3750
3867
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3751
- // Update default model file name if not provided
3752
- options.model_file_name ??= 'vision_model';
3753
- return super.from_pretrained(pretrained_model_name_or_path, options);
3868
+ return super.from_pretrained(pretrained_model_name_or_path, {
3869
+ // Update default model file name if not provided
3870
+ model_file_name: 'vision_model',
3871
+ ...options,
3872
+ });
3754
3873
  }
3755
3874
  }
3756
3875
  //////////////////////////////////////////////////
@@ -3834,9 +3953,11 @@ export class SiglipModel extends SiglipPreTrainedModel { }
3834
3953
  export class SiglipTextModel extends SiglipPreTrainedModel {
3835
3954
  /** @type {typeof PreTrainedModel.from_pretrained} */
3836
3955
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3837
- // Update default model file name if not provided
3838
- options.model_file_name ??= 'text_model';
3839
- return super.from_pretrained(pretrained_model_name_or_path, options);
3956
+ return super.from_pretrained(pretrained_model_name_or_path, {
3957
+ // Update default model file name if not provided
3958
+ model_file_name: 'text_model',
3959
+ ...options,
3960
+ });
3840
3961
  }
3841
3962
  }
3842
3963
 
@@ -3869,9 +3990,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
3869
3990
  export class SiglipVisionModel extends CLIPPreTrainedModel {
3870
3991
  /** @type {typeof PreTrainedModel.from_pretrained} */
3871
3992
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3872
- // Update default model file name if not provided
3873
- options.model_file_name ??= 'vision_model';
3874
- return super.from_pretrained(pretrained_model_name_or_path, options);
3993
+ return super.from_pretrained(pretrained_model_name_or_path, {
3994
+ // Update default model file name if not provided
3995
+ model_file_name: 'vision_model',
3996
+ ...options,
3997
+ });
3875
3998
  }
3876
3999
  }
3877
4000
  //////////////////////////////////////////////////
@@ -3926,18 +4049,22 @@ export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
3926
4049
  export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
3927
4050
  /** @type {typeof PreTrainedModel.from_pretrained} */
3928
4051
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3929
- // Update default model file name if not provided
3930
- options.model_file_name ??= 'text_model';
3931
- return super.from_pretrained(pretrained_model_name_or_path, options);
4052
+ return super.from_pretrained(pretrained_model_name_or_path, {
4053
+ // Update default model file name if not provided
4054
+ model_file_name: 'text_model',
4055
+ ...options,
4056
+ });
3932
4057
  }
3933
4058
  }
3934
4059
 
3935
4060
  export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
3936
4061
  /** @type {typeof PreTrainedModel.from_pretrained} */
3937
4062
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3938
- // Update default model file name if not provided
3939
- options.model_file_name ??= 'vision_model';
3940
- return super.from_pretrained(pretrained_model_name_or_path, options);
4063
+ return super.from_pretrained(pretrained_model_name_or_path, {
4064
+ // Update default model file name if not provided
4065
+ model_file_name: 'vision_model',
4066
+ ...options,
4067
+ });
3941
4068
  }
3942
4069
  }
3943
4070
  //////////////////////////////////////////////////
@@ -4097,6 +4224,14 @@ export class LlamaForCausalLM extends LlamaPreTrainedModel { }
4097
4224
  //////////////////////////////////////////////////
4098
4225
 
4099
4226
 
4227
+ //////////////////////////////////////////////////
4228
+ // EXAONE models
4229
+ export class ExaonePreTrainedModel extends PreTrainedModel { }
4230
+ export class ExaoneModel extends ExaonePreTrainedModel { }
4231
+ export class ExaoneForCausalLM extends ExaonePreTrainedModel { }
4232
+ //////////////////////////////////////////////////
4233
+
4234
+
4100
4235
  //////////////////////////////////////////////////
4101
4236
  // MobileLLM models
4102
4237
  export class MobileLLMPreTrainedModel extends PreTrainedModel { }
@@ -6159,9 +6294,11 @@ export class ClapModel extends ClapPreTrainedModel { }
6159
6294
  export class ClapTextModelWithProjection extends ClapPreTrainedModel {
6160
6295
  /** @type {typeof PreTrainedModel.from_pretrained} */
6161
6296
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
6162
- // Update default model file name if not provided
6163
- options.model_file_name ??= 'text_model';
6164
- return super.from_pretrained(pretrained_model_name_or_path, options);
6297
+ return super.from_pretrained(pretrained_model_name_or_path, {
6298
+ // Update default model file name if not provided
6299
+ model_file_name: 'text_model',
6300
+ ...options,
6301
+ });
6165
6302
  }
6166
6303
  }
6167
6304
 
@@ -6194,9 +6331,11 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
6194
6331
  export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
6195
6332
  /** @type {typeof PreTrainedModel.from_pretrained} */
6196
6333
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
6197
- // Update default model file name if not provided
6198
- options.model_file_name ??= 'audio_model';
6199
- return super.from_pretrained(pretrained_model_name_or_path, options);
6334
+ return super.from_pretrained(pretrained_model_name_or_path, {
6335
+ // Update default model file name if not provided
6336
+ model_file_name: 'audio_model',
6337
+ ...options,
6338
+ });
6200
6339
  }
6201
6340
  }
6202
6341
  //////////////////////////////////////////////////
@@ -6883,6 +7022,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6883
7022
  ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]],
6884
7023
  ['codegen', ['CodeGenModel', CodeGenModel]],
6885
7024
  ['llama', ['LlamaModel', LlamaModel]],
7025
+ ['exaone', ['ExaoneModel', ExaoneModel]],
6886
7026
  ['olmo', ['OlmoModel', OlmoModel]],
6887
7027
  ['olmo2', ['Olmo2Model', Olmo2Model]],
6888
7028
  ['mobilellm', ['MobileLLMModel', MobileLLMModel]],
@@ -6905,6 +7045,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6905
7045
  const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
6906
7046
  ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
6907
7047
  ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
7048
+ ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
6908
7049
  ]);
6909
7050
 
6910
7051
  const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
@@ -6975,6 +7116,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6975
7116
  ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
6976
7117
  ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
6977
7118
  ['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
7119
+ ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]],
6978
7120
  ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
6979
7121
  ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
6980
7122
  ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
@@ -6994,6 +7136,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6994
7136
  ['falcon', ['FalconForCausalLM', FalconForCausalLM]],
6995
7137
  ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]],
6996
7138
  ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]],
7139
+
7140
+ // Also image-text-to-text
7141
+ ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]],
6997
7142
  ]);
6998
7143
 
6999
7144
  const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
@@ -7231,6 +7376,7 @@ const CUSTOM_MAPPING = [
7231
7376
  // OVERRIDE:
7232
7377
  // TODO: Refactor to allow class to specify model
7233
7378
  ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen],
7379
+ ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V],
7234
7380
 
7235
7381
  ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
7236
7382
  ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
@@ -100,4 +100,15 @@ export class TensorOpRegistry {
100
100
  }
101
101
  return this._top_k;
102
102
  }
103
+
104
+ static get slice() {
105
+ if (!this._slice) {
106
+ this._slice = wrap(
107
+ [8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13],
108
+ this.session_options,
109
+ 'y',
110
+ )
111
+ }
112
+ return this._slice;
113
+ }
103
114
  }
package/src/pipelines.js CHANGED
@@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1729
1729
  case 'unispeech-sat':
1730
1730
  case 'hubert':
1731
1731
  return this._call_wav2vec2(audio, kwargs)
1732
+ case 'moonshine':
1733
+ return this._call_moonshine(audio, kwargs)
1732
1734
  default:
1733
1735
  throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
1734
1736
  }
@@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1882
1884
  }
1883
1885
  return single ? toReturn[0] : toReturn;
1884
1886
  }
1887
+
1888
+ /**
1889
+ * @type {AutomaticSpeechRecognitionPipelineCallback}
1890
+ * @private
1891
+ */
1892
+ async _call_moonshine(audio, kwargs) {
1893
+ const single = !Array.isArray(audio);
1894
+ if (single) {
1895
+ audio = [/** @type {AudioInput} */ (audio)];
1896
+ }
1897
+ const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1898
+ const preparedAudios = await prepareAudios(audio, sampling_rate);
1899
+ const toReturn = [];
1900
+ for (const aud of preparedAudios) {
1901
+ const inputs = await this.processor(aud);
1902
+
1903
+ // According to the [paper](https://arxiv.org/pdf/2410.15608):
1904
+ // "We use greedy decoding, with a heuristic limit of 6 output tokens
1905
+ // per second of audio to avoid repeated output sequences."
1906
+ const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
1907
+ const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
1908
+
1909
+ const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
1910
+ toReturn.push({ text });
1911
+ }
1912
+ return single ? toReturn[0] : toReturn;
1913
+ }
1914
+
1885
1915
  }
1886
1916
 
1887
1917
  /**
@@ -772,8 +772,21 @@ export class Tensor {
772
772
  if (!DataTypeMap.hasOwnProperty(type)) {
773
773
  throw new Error(`Unsupported type: ${type}`);
774
774
  }
775
+
776
+ // Handle special cases where a mapping function is needed (e.g., where one type is a bigint and the other is a number)
777
+ let map_fn;
778
+ const is_source_bigint = ['int64', 'uint64'].includes(this.type);
779
+ const is_dest_bigint = ['int64', 'uint64'].includes(type);
780
+ if (is_source_bigint && !is_dest_bigint) {
781
+ // TypeError: Cannot convert a BigInt value to a number
782
+ map_fn = Number;
783
+ } else if (!is_source_bigint && is_dest_bigint) {
784
+ // TypeError: Cannot convert [x] to a BigInt
785
+ map_fn = BigInt;
786
+ }
787
+
775
788
  // @ts-ignore
776
- return new Tensor(type, DataTypeMap[type].from(this.data), this.dims);
789
+ return new Tensor(type, DataTypeMap[type].from(this.data, map_fn), this.dims);
777
790
  }
778
791
  }
779
792
 
@@ -971,6 +984,29 @@ export async function topk(x, k) {
971
984
  });
972
985
  }
973
986
 
987
+
988
+ const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length]);
989
+ /**
990
+ * Slice a multidimensional float32 tensor.
991
+ * @param {Tensor} data: Tensor of data to extract slices from
992
+ * @param {number[]} starts: 1-D array of starting indices of corresponding axis in axes
993
+ * @param {number[]} ends: 1-D array of ending indices (exclusive) of corresponding axis in axes
994
+ * @param {number[]} axes: 1-D array of axes that starts and ends apply to
995
+ * @param {number[]} [steps]: 1-D array of slice step of corresponding axis in axes.
996
+ * @returns {Promise<Tensor>} Sliced data tensor.
997
+ */
998
+ export async function slice(data, starts, ends, axes, steps) {
999
+ const op = await TensorOpRegistry.slice;
1000
+ return await op({
1001
+ x: data,
1002
+ s: arrayToIndexTensor(starts),
1003
+ e: arrayToIndexTensor(ends),
1004
+ a: arrayToIndexTensor(axes),
1005
+ t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)),
1006
+ });
1007
+ }
1008
+
1009
+
974
1010
  /**
975
1011
  * Perform mean pooling of the last hidden state followed by a normalization step.
976
1012
  * @param {Tensor} last_hidden_state Tensor of shape [batchSize, seqLength, embedDim]
@@ -1417,6 +1453,20 @@ export function zeros_like(tensor) {
1417
1453
  return zeros(tensor.dims);
1418
1454
  }
1419
1455
 
1456
+ /**
1457
+ * Returns a tensor filled with random numbers from a uniform distribution on the interval [0, 1)
1458
+ * @param {number[]} size A sequence of integers defining the shape of the output tensor.
1459
+ * @returns {Tensor} The random tensor.
1460
+ */
1461
+ export function rand(size) {
1462
+ const length = size.reduce((a, b) => a * b, 1);
1463
+ return new Tensor(
1464
+ "float32",
1465
+ Float32Array.from({ length }, () => Math.random()),
1466
+ size,
1467
+ )
1468
+ }
1469
+
1420
1470
  /**
1421
1471
  * Quantizes the embeddings tensor to binary or unsigned binary precision.
1422
1472
  * @param {Tensor} tensor The tensor to quantize.
@@ -149,7 +149,7 @@ export class ImageProcessor extends ImageProcessor_base {
149
149
  * Pad the image by a certain amount.
150
150
  * @param {Float32Array} pixelData The pixel data to pad.
151
151
  * @param {number[]} imgDims The dimensions of the image (height, width, channels).
152
- * @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
152
+ * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
153
153
  * @param {Object} options The options for padding.
154
154
  * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
155
155
  * @param {boolean} [options.center=false] Whether to center the image.
@@ -159,7 +159,7 @@ export class ImageProcessor extends ImageProcessor_base {
159
159
  pad_image(pixelData: Float32Array, imgDims: number[], padSize: {
160
160
  width: number;
161
161
  height: number;
162
- } | number, { mode, center, constant_values, }?: {
162
+ } | number | "square", { mode, center, constant_values, }?: {
163
163
  mode?: "constant" | "symmetric";
164
164
  center?: boolean;
165
165
  constant_values?: number | number[];
@@ -1 +1 @@
1
- {"version":3,"file":"image_processors_utils.d.ts","sourceRoot":"","sources":["../../src/base/image_processors_utils.js"],"names":[],"mappings":"AA+EA;;;;;;;;;GASG;AACH,uDAPG;IAAwB,MAAM,EAAtB,MAAM;IACU,UAAU,EAA1B,MAAM;CACd,cAAQ,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,iBAClB,OAAO,GACN,KAAQ,CAwEnB;AAGD;;;;;;GAMG;AACH,4DALW,GAAC,iBACD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB;IAAC,YAAY,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAA;CAAC,EAAE,CAwDtD;AAkPD;;;;;;;;;GASG;AACH,4DARW,GAAC,cACD,MAAM,mBACN,MAAM,gCACN,MAAM,sBACN,GAAG,CAAC,MAAM,CAAC,iBACX,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAChB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAuE/G;AAGD;;;;;;;GAOG;AACH,4DANW,GAAC,cACD,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAI/G;;KA3iBsC,GAAG;UAAyB,GACnE;;AA6iBA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH;IAmeI;;;;;;;;;;;;;;OAcG;IACH,sDATW,MAAM,WAKN,OAAO,iBAAiB,EAAE,iBAAiB,GAEzC,OAAO,CAAC,cAAc,CAAC,CAKnC;IAnfD;;;OAGG;IACH,oBAFW,oBAAoB,EAmC9B;IA9BG,qBAAkD;IAClD,oBAA+C;IAE/C,iBAAoC;IACpC,oBAA2C;IAC3C,uBAAwD;IACxD,sBAAuC;IAEvC,sBAAuC;IACvC,UAA4C;IAC5C,mBAA8D;IAC9D,uBAAwE;IAExE,wBAA2C;IAC3C,eAAiC;IACjC,oBAAmD;IACnD,oBAA2C;IAE3C,cAA+B;IAC/B,YAA2B;IAQ3B,+BAAkE;IAElE,6BAAoB;IAGxB;;;;;;;OAOG;IACH,iBALW,QAAQ,QACR;QAAC,MAAM,EAAC,MAAM,CAAC;QAAC,KAAK,EAAC,MAAM,CAAA;KAAC,aAC7B,MAAM,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAC5B,OAAO,CAAC,QAAQ,CAAC,CAsB7B;IAGD;;;;;OAKG;IACH,mBAJW,QAAQ,mBACR,MAAM,GACJ,OAAO,CAAC,QAAQ,CAAC,CAiC7B;IAED;;;;;;;;;;OAUG;IACH,qBATW,YAAY,WACZ,MAAM,EAAE,WACR;QAAC,KAAK,EAAC,MAAM,CAAC;QAAC,MAAM,EAAC,MAAM,CAAA;KAAC,GAAC,MAAM,uCAE5C;QAAyC,IAAI,GAArC,UAAU,GAAC,WAAW;QACJ,MAAM,GAAxB,OAAO;QACmB,eAAe,GAAzC,MAAM,GAAC,MAAM,EAAE;KACvB,GAAU,CAAC,YAAY,EAAE,MAAM,EAAE,CAAC,CA6EpC;IAED;;;;OAIG;IACH,mBAHW,YAAY,GAC
V,IAAI,CAMhB;IAED;;;;;;OAMG;IACH,oCAJW,QAAQ,QACR,GAAG,GACD,CAAC,MAAM,EAAE,MAAM,CAAC,CA4F5B;IAED;;;;OAIG;IACH,cAHW,QAAQ,GACN,OAAO,CAAC,QAAQ,CAAC,CAO7B;IAED;;;;;OAKG;IAEH;;;;;;OAMG;IACH,kBAJW,QAAQ,iGAEN,OAAO;;;;uBAVN,WAAW;;;;6BACX,WAAW;;;;sBACX,MAAM;MAQmB,CAsHtC;IAED;;;;;;;OAOG;IACH,cAJW,QAAQ,EAAE,WACP,GAAG,EAAA,GACJ,OAAO,CAAC,oBAAoB,CAAC,CAqBzC;CAsBJ;;;;;0BArjCY,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC;;;;;kBAM9B,MAAM;;;;oBACN,WAAW,EAAE;;;;0BACb,WAAW,EAAE;;;;;;;;;;;;;iBAgiBb,MAAM,EAAE;;;;gBACR,MAAM,EAAE;;;;iBACR,OAAO;;;;qBACP,MAAM;;;;mBACN,OAAO;;;;gBACP,OAAO;;;;eACP,MAAM;;;;WACN,MAAM,MAAO;;;;iBACb,MAAM,MAAO;;;;;4BACb,OAAO;;;;;qBAEP,OAAO;;;;mBAEP,OAAO;;;;;wBACP,OAAO;;;;;yBAEP,MAAM;;;;WAGN,MAAM,EAAE;;;;UACR,MAAM,EAAE;;uBAtkBqB,oBAAoB;yBAEtC,mBAAmB"}
1
+ {"version":3,"file":"image_processors_utils.d.ts","sourceRoot":"","sources":["../../src/base/image_processors_utils.js"],"names":[],"mappings":"AA+EA;;;;;;;;;GASG;AACH,uDAPG;IAAwB,MAAM,EAAtB,MAAM;IACU,UAAU,EAA1B,MAAM;CACd,cAAQ,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,iBAClB,OAAO,GACN,KAAQ,CAwEnB;AAGD;;;;;;GAMG;AACH,4DALW,GAAC,iBACD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB;IAAC,YAAY,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAA;CAAC,EAAE,CAwDtD;AAkPD;;;;;;;;;GASG;AACH,4DARW,GAAC,cACD,MAAM,mBACN,MAAM,gCACN,MAAM,sBACN,GAAG,CAAC,MAAM,CAAC,iBACX,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAChB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAuE/G;AAGD;;;;;;;GAOG;AACH,4DANW,GAAC,cACD,MAAM,iBACN,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAEhB,KAAK,CAAC;IAAE,YAAY,EAAE,MAAM,CAAC;IAAC,aAAa,EAAE,KAAK,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAC,CAAC,CAAA;CAAC,CAAC,CAI/G;;KA3iBsC,GAAG;UAAyB,GACnE;;AA6iBA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH;IAqeI;;;;;;;;;;;;;;OAcG;IACH,sDATW,MAAM,WAKN,OAAO,iBAAiB,EAAE,iBAAiB,GAEzC,OAAO,CAAC,cAAc,CAAC,CAKnC;IArfD;;;OAGG;IACH,oBAFW,oBAAoB,EAmC9B;IA9BG,qBAAkD;IAClD,oBAA+C;IAE/C,iBAAoC;IACpC,oBAA2C;IAC3C,uBAAwD;IACxD,sBAAuC;IAEvC,sBAAuC;IACvC,UAA4C;IAC5C,mBAA8D;IAC9D,uBAAwE;IAExE,wBAA2C;IAC3C,eAAiC;IACjC,oBAAmD;IACnD,oBAA2C;IAE3C,cAA+B;IAC/B,YAA2B;IAQ3B,+BAAkE;IAElE,6BAAoB;IAGxB;;;;;;;OAOG;IACH,iBALW,QAAQ,QACR;QAAC,MAAM,EAAC,MAAM,CAAC;QAAC,KAAK,EAAC,MAAM,CAAA;KAAC,aAC7B,MAAM,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAC5B,OAAO,CAAC,QAAQ,CAAC,CAsB7B;IAGD;;;;;OAKG;IACH,mBAJW,QAAQ,mBACR,MAAM,GACJ,OAAO,CAAC,QAAQ,CAAC,CAiC7B;IAED;;;;;;;;;;OAUG;IACH,qBATW,YAAY,WACZ,MAAM,EAAE,WACR;QAAC,KAAK,EAAC,MAAM,CAAC;QAAC,MAAM,EAAC,MAAM,CAAA;KAAC,GAAC,MAAM,GAAC,QAAQ,uCAErD;QAAyC,IAAI,GAArC,UAAU,GAAC,WAAW;QACJ,MAAM,GAAxB,OAAO;QACmB,eAAe,GAAzC,MAAM,GAAC,MAAM,EAAE;KACvB,GAAU,CAAC,YAAY,EAAE,MAAM,EAAE,CAAC,CA+EpC;IAED;;;;OAIG;IACH,mBAH
W,YAAY,GACV,IAAI,CAMhB;IAED;;;;;;OAMG;IACH,oCAJW,QAAQ,QACR,GAAG,GACD,CAAC,MAAM,EAAE,MAAM,CAAC,CA4F5B;IAED;;;;OAIG;IACH,cAHW,QAAQ,GACN,OAAO,CAAC,QAAQ,CAAC,CAO7B;IAED;;;;;OAKG;IAEH;;;;;;OAMG;IACH,kBAJW,QAAQ,iGAEN,OAAO;;;;uBAVN,WAAW;;;;6BACX,WAAW;;;;sBACX,MAAM;MAQmB,CAsHtC;IAED;;;;;;;OAOG;IACH,cAJW,QAAQ,EAAE,WACP,GAAG,EAAA,GACJ,OAAO,CAAC,oBAAoB,CAAC,CAqBzC;CAsBJ;;;;;0BAvjCY,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC;;;;;kBAM9B,MAAM;;;;oBACN,WAAW,EAAE;;;;0BACb,WAAW,EAAE;;;;;;;;;;;;;iBAgiBb,MAAM,EAAE;;;;gBACR,MAAM,EAAE;;;;iBACR,OAAO;;;;qBACP,MAAM;;;;mBACN,OAAO;;;;gBACP,OAAO;;;;eACP,MAAM;;;;WACN,MAAM,MAAO;;;;iBACb,MAAM,MAAO;;;;;4BACb,OAAO;;;;;qBAEP,OAAO;;;;mBAEP,OAAO;;;;;wBACP,OAAO;;;;;yBAEP,MAAM;;;;WAGN,MAAM,EAAE;;;;UACR,MAAM,EAAE;;uBAtkBqB,oBAAoB;yBAEtC,mBAAmB"}
@@ -1 +1 @@
1
- {"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AAmOA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCArVY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBAkVrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC"}
1
+ {"version":3,"file":"configs.d.ts","sourceRoot":"","sources":["../src/configs.js"],"names":[],"mappings":"AA2OA;;;;GAIG;AACH,0CAHW,gBAAgB;;;IACd,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CA2EpC;AACD;;;GAGG;AACH;IAwBI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;IArCD;;;OAGG;IACH,6BAGC;IAnBD,0BAA0B;IAC1B,YADW,MAAM,GAAC,IAAI,CACJ;IAElB,sBAAsB;IACtB,oBADW,OAAO,CACS;IAE3B,qBAAqB;IACrB,yBADW,MAAM,CACO;IAExB,mCAAmC;IACnC,0BADW,oBAAoB,CACN;IAQrB,uBAAkD;CAgCzD;AAED;;;;;GAKG;AACH;IArCI;;;;;;;;OAQG;IACH,sDANW,MAAM,0EACN,iBAAiB,GAGf,OAAO,CAAC,gBAAgB,CAAC,CAqBrC;CAcJ;gCA7VY,OAAO,gBAAgB,EAAE,iBAAiB;+BAI1C,OAAO,iBAAiB,EAAE,gBAAgB;2BAI1C,OAAO,iBAAiB,EAAE,YAAY;;;;;;;;qBA0VrC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,OAAO,mBAAmB,EAAE,QAAQ,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;;;+BACvH,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;;;;aAGtB,OAAO,oBAAoB,EAAE,UAAU;;;;YACvC,OAAO,mBAAmB,EAAE,QAAQ,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,mBAAmB,EAAE,QAAQ,CAAC;;;;+BACzF,OAAO,GAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"image_processing_auto.d.ts","sourceRoot":"","sources":["../../../src/models/auto/image_processing_auto.js"],"names":[],"mappings":"AAMA;kFAuBk7xC,oBAAiB;CADl8xC;+BAzB8B,sCAAsC"}
1
+ {"version":3,"file":"image_processing_auto.d.ts","sourceRoot":"","sources":["../../../src/models/auto/image_processing_auto.js"],"names":[],"mappings":"AAMA;kFAuB4jyC,oBAAiB;CAD5kyC;+BAzB8B,sCAAsC"}
@@ -1,5 +1,6 @@
1
1
  export * from "./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js";
2
2
  export * from "./clap/feature_extraction_clap.js";
3
+ export * from "./moonshine/feature_extraction_moonshine.js";
3
4
  export * from "./pyannote/feature_extraction_pyannote.js";
4
5
  export * from "./seamless_m4t/feature_extraction_seamless_m4t.js";
5
6
  export * from "./speecht5/feature_extraction_speecht5.js";
@@ -1 +1 @@
1
- {"version":3,"file":"image_processing_idefics3.d.ts","sourceRoot":"","sources":["../../../src/models/idefics3/image_processing_idefics3.js"],"names":[],"mappings":"AAOA;IACI,yBAKC;IAFG,wBAA2D;IAC3D,oBAA2C;IAG/C;;;OAGG;IAEH;;;;;OAKG;IACH,6GAHW,MAAM;;;MAiBhB;IAED,uDAAuD;IACvD,cADY,0CAAS,yCAAU,GAAC,yCAAU,EAAE;;;;;;;;;;OA4H3C;IAED;;;;;;OAiDC;CACJ;+BAtNM,sCAAsC"}
1
+ {"version":3,"file":"image_processing_idefics3.d.ts","sourceRoot":"","sources":["../../../src/models/idefics3/image_processing_idefics3.js"],"names":[],"mappings":"AAOA;IACI,yBAKC;IAFG,wBAA2D;IAC3D,oBAA2C;IAG/C;;;OAGG;IAEH;;;;;OAKG;IACH,6GAHW,MAAM;;;MAiBhB;IAED,uDAAuD;IACvD,cADY,0CAAS,yCAAU,GAAC,yCAAU,EAAE;;;;;;;;;;OA4H3C;IAED;;;;;;OA4DC;CACJ;+BAjOM,sCAAsC"}
@@ -23,6 +23,7 @@ export * from "./mobilevit/image_processing_mobilevit.js";
23
23
  export * from "./nougat/image_processing_nougat.js";
24
24
  export * from "./owlv2/image_processing_owlv2.js";
25
25
  export * from "./owlvit/image_processing_owlvit.js";
26
+ export * from "./phi3_v/image_processing_phi3_v.js";
26
27
  export * from "./pvt/image_processing_pvt.js";
27
28
  export * from "./qwen2_vl/image_processing_qwen2_vl.js";
28
29
  export * from "./rt_detr/image_processing_rt_detr.js";
@@ -0,0 +1,13 @@
1
+ export class MoonshineFeatureExtractor extends FeatureExtractor {
2
+ /**
3
+ * Asynchronously extracts input values from a given audio using the provided configuration.
4
+ * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
5
+ * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
6
+ */
7
+ _call(audio: Float32Array | Float64Array): Promise<{
8
+ input_values: Tensor;
9
+ }>;
10
+ }
11
+ import { FeatureExtractor } from '../../base/feature_extraction_utils.js';
12
+ import { Tensor } from '../../utils/tensor.js';
13
+ //# sourceMappingURL=feature_extraction_moonshine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"feature_extraction_moonshine.d.ts","sourceRoot":"","sources":["../../../src/models/moonshine/feature_extraction_moonshine.js"],"names":[],"mappings":"AAIA;IACI;;;;OAIG;IACH,aAHW,YAAY,GAAC,YAAY,GACvB,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;KAAE,CAAC,CAgB9C;CACJ;iCAzBuD,wCAAwC;uBACzE,uBAAuB"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Represents a MoonshineProcessor that extracts features from an audio input.
3
+ */
4
+ export class MoonshineProcessor extends Processor {
5
+ static tokenizer_class: typeof AutoTokenizer;
6
+ static feature_extractor_class: typeof AutoFeatureExtractor;
7
+ /**
8
+ * Calls the feature_extractor function with the given audio input.
9
+ * @param {any} audio The audio input to extract features from.
10
+ * @returns {Promise<any>} A Promise that resolves with the extracted features.
11
+ */
12
+ _call(audio: any): Promise<any>;
13
+ }
14
+ import { Processor } from "../../base/processing_utils.js";
15
+ import { AutoTokenizer } from "../../tokenizers.js";
16
+ import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
17
+ //# sourceMappingURL=processing_moonshine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"processing_moonshine.d.ts","sourceRoot":"","sources":["../../../src/models/moonshine/processing_moonshine.js"],"names":[],"mappings":"AAIA;;GAEG;AACH;IACI,6CAAsC;IACtC,4DAAqD;IAErD;;;;OAIG;IACH,aAHW,GAAG,GACD,OAAO,CAAC,GAAG,CAAC,CAIxB;CACJ;0BAjByB,gCAAgC;8BAD5B,qBAAqB;qCADd,oCAAoC"}
@@ -0,0 +1,17 @@
1
+ export class Phi3VImageProcessor extends ImageProcessor {
2
+ constructor(config: any);
3
+ _num_crops: any;
4
+ calc_num_image_tokens_from_image_size(width: any, height: any): number;
5
+ _call(images: any, { num_crops, }?: {
6
+ num_crops?: any;
7
+ }): Promise<{
8
+ pixel_values: Tensor;
9
+ original_sizes: any[];
10
+ reshaped_input_sizes: any[];
11
+ image_sizes: Tensor;
12
+ num_img_tokens: number[];
13
+ }>;
14
+ }
15
+ import { ImageProcessor } from "../../base/image_processors_utils.js";
16
+ import { Tensor } from "../../utils/tensor.js";
17
+ //# sourceMappingURL=image_processing_phi3_v.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"image_processing_phi3_v.d.ts","sourceRoot":"","sources":["../../../src/models/phi3_v/image_processing_phi3_v.js"],"names":[],"mappings":"AASA;IACI,yBAWC;IADG,gBAAkC;IAEtC,uEAIC;IA0CD;;;;;;;;OA6FC;CACJ;+BAhKM,sCAAsC;uBACa,uBAAuB"}
@@ -0,0 +1,17 @@
1
+ export class Phi3VProcessor extends Processor {
2
+ static image_processor_class: typeof AutoImageProcessor;
3
+ static tokenizer_class: typeof AutoTokenizer;
4
+ /**
5
+ *
6
+ * @param {string|string[]} text
7
+ * @param {RawImage|RawImage[]} images
8
+ * @param {...any} args
9
+ * @returns {Promise<any>}
10
+ */
11
+ _call(text: string | string[], images?: RawImage | RawImage[], { padding, truncation, num_crops, }?: any[]): Promise<any>;
12
+ }
13
+ import { Processor } from "../../base/processing_utils.js";
14
+ import { RawImage } from "../../utils/image.js";
15
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
16
+ import { AutoTokenizer } from "../../tokenizers.js";
17
+ //# sourceMappingURL=processing_phi3_v.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"processing_phi3_v.d.ts","sourceRoot":"","sources":["../../../src/models/phi3_v/processing_phi3_v.js"],"names":[],"mappings":"AAQA;IACI,wDAAiD;IACjD,6CAAsC;IAEtC;;;;;;OAMG;IACH,YALW,MAAM,GAAC,MAAM,EAAE,WACf,QAAQ,GAAC,QAAQ,EAAE,wCACf,GAAG,EAAA,GACL,OAAO,CAAC,GAAG,CAAC,CAkCxB;CACJ;0BApDyB,gCAAgC;yBAGjC,sBAAsB;mCAFZ,kCAAkC;8BACvC,qBAAqB"}