@huggingface/transformers 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. package/README.md +10 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +1062 -183
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +2239 -1232
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +1 -358
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +1 -421
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +1 -358
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +1082 -181
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +11 -16
  16. package/src/backends/onnx.js +2 -7
  17. package/src/base/image_processors_utils.js +3 -1
  18. package/src/configs.js +11 -2
  19. package/src/env.js +1 -1
  20. package/src/models/feature_extractors.js +1 -0
  21. package/src/models/idefics3/image_processing_idefics3.js +24 -13
  22. package/src/models/image_processors.js +1 -0
  23. package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
  24. package/src/models/moonshine/processing_moonshine.js +20 -0
  25. package/src/models/paligemma/processing_paligemma.js +82 -0
  26. package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
  27. package/src/models/phi3_v/processing_phi3_v.js +53 -0
  28. package/src/models/processors.js +3 -0
  29. package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
  30. package/src/models/pyannote/processing_pyannote.js +7 -54
  31. package/src/models.js +233 -35
  32. package/src/ops/registry.js +11 -0
  33. package/src/pipelines.js +30 -0
  34. package/src/tokenizers.js +12 -1
  35. package/src/utils/core.js +39 -9
  36. package/src/utils/hub.js +8 -12
  37. package/src/utils/image.js +40 -0
  38. package/src/utils/tensor.js +51 -1
  39. package/types/backends/onnx.d.ts +2 -2
  40. package/types/backends/onnx.d.ts.map +1 -1
  41. package/types/base/feature_extraction_utils.d.ts +1 -1
  42. package/types/base/feature_extraction_utils.d.ts.map +1 -1
  43. package/types/base/image_processors_utils.d.ts +4 -4
  44. package/types/base/image_processors_utils.d.ts.map +1 -1
  45. package/types/base/processing_utils.d.ts +4 -4
  46. package/types/base/processing_utils.d.ts.map +1 -1
  47. package/types/configs.d.ts +7 -7
  48. package/types/configs.d.ts.map +1 -1
  49. package/types/env.d.ts +1 -1
  50. package/types/env.d.ts.map +1 -1
  51. package/types/generation/configuration_utils.d.ts +2 -2
  52. package/types/generation/logits_process.d.ts +2 -2
  53. package/types/generation/logits_process.d.ts.map +1 -1
  54. package/types/generation/logits_sampler.d.ts.map +1 -1
  55. package/types/generation/parameters.d.ts +5 -5
  56. package/types/generation/stopping_criteria.d.ts +1 -1
  57. package/types/generation/stopping_criteria.d.ts.map +1 -1
  58. package/types/generation/streamers.d.ts +2 -2
  59. package/types/generation/streamers.d.ts.map +1 -1
  60. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
  61. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
  62. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  63. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  64. package/types/models/auto/processing_auto.d.ts +1 -1
  65. package/types/models/auto/processing_auto.d.ts.map +1 -1
  66. package/types/models/clap/feature_extraction_clap.d.ts +1 -1
  67. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  68. package/types/models/detr/image_processing_detr.d.ts +11 -11
  69. package/types/models/detr/image_processing_detr.d.ts.map +1 -1
  70. package/types/models/donut/image_processing_donut.d.ts +1 -1
  71. package/types/models/donut/image_processing_donut.d.ts.map +1 -1
  72. package/types/models/feature_extractors.d.ts +1 -0
  73. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  74. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
  75. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
  76. package/types/models/image_processors.d.ts +1 -0
  77. package/types/models/janus/image_processing_janus.d.ts +1 -1
  78. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  79. package/types/models/janus/processing_janus.d.ts.map +1 -1
  80. package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
  81. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
  82. package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
  83. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  84. package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
  85. package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
  86. package/types/models/moonshine/processing_moonshine.d.ts +17 -0
  87. package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
  88. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
  89. package/types/models/paligemma/processing_paligemma.d.ts +12 -0
  90. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
  91. package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
  92. package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
  93. package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
  94. package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
  95. package/types/models/processors.d.ts +3 -0
  96. package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
  97. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  98. package/types/models/pyannote/processing_pyannote.d.ts +4 -15
  99. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
  100. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  101. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
  102. package/types/models/sam/image_processing_sam.d.ts.map +1 -1
  103. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
  104. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
  105. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
  106. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
  107. package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
  108. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
  109. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
  110. package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
  111. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
  112. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
  113. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
  114. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
  115. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
  116. package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
  117. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  118. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  119. package/types/models/whisper/processing_whisper.d.ts.map +1 -1
  120. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
  121. package/types/models.d.ts +61 -5
  122. package/types/models.d.ts.map +1 -1
  123. package/types/ops/registry.d.ts +1 -0
  124. package/types/ops/registry.d.ts.map +1 -1
  125. package/types/pipelines.d.ts +31 -51
  126. package/types/pipelines.d.ts.map +1 -1
  127. package/types/tokenizers.d.ts +10 -6
  128. package/types/tokenizers.d.ts.map +1 -1
  129. package/types/utils/audio.d.ts.map +1 -1
  130. package/types/utils/constants.d.ts.map +1 -1
  131. package/types/utils/core.d.ts +87 -22
  132. package/types/utils/core.d.ts.map +1 -1
  133. package/types/utils/data-structures.d.ts.map +1 -1
  134. package/types/utils/devices.d.ts.map +1 -1
  135. package/types/utils/dtypes.d.ts.map +1 -1
  136. package/types/utils/generic.d.ts.map +1 -1
  137. package/types/utils/hub.d.ts +3 -3
  138. package/types/utils/hub.d.ts.map +1 -1
  139. package/types/utils/image.d.ts +10 -1
  140. package/types/utils/image.d.ts.map +1 -1
  141. package/types/utils/maths.d.ts +10 -10
  142. package/types/utils/maths.d.ts.map +1 -1
  143. package/types/utils/tensor.d.ts +22 -6
  144. package/types/utils/tensor.d.ts.map +1 -1
@@ -1,9 +1,8 @@
1
1
  import { Processor } from '../../base/processing_utils.js';
2
- import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
3
- import { max, softmax } from '../../utils/maths.js';
2
+ import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';
4
3
 
5
4
  export class PyAnnoteProcessor extends Processor {
6
- static feature_extractor_class = AutoFeatureExtractor
5
+ static feature_extractor_class = PyAnnoteFeatureExtractor
7
6
 
8
7
  /**
9
8
  * Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
14
13
  return await this.feature_extractor(audio)
15
14
  }
16
15
 
17
- /**
18
- * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
19
- * @param {number} samples The number of frames in the audio.
20
- * @returns {number} The number of frames in the audio.
21
- */
22
- samples_to_frames(samples) {
23
- return ((samples - this.config.offset) / this.config.step);
16
+ /** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
17
+ post_process_speaker_diarization(...args) {
18
+ return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
24
19
  }
25
20
 
26
- /**
27
- * Post-processes the speaker diarization logits output by the model.
28
- * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
29
- * @param {number} num_samples Number of samples in the input audio.
30
- * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
31
- */
32
- post_process_speaker_diarization(logits, num_samples) {
33
- const ratio = (
34
- num_samples / this.samples_to_frames(num_samples)
35
- ) / this.config.sampling_rate;
36
-
37
- const results = [];
38
- for (const scores of logits.tolist()) {
39
- const accumulated_segments = [];
40
-
41
- let current_speaker = -1;
42
- for (let i = 0; i < scores.length; ++i) {
43
- const probabilities = softmax(scores[i]);
44
- const [score, id] = max(probabilities);
45
- const [start, end] = [i, i + 1];
46
-
47
- if (id !== current_speaker) {
48
- // Speaker has changed
49
- current_speaker = id;
50
- accumulated_segments.push({ id, start, end, score });
51
- } else {
52
- // Continue the current segment
53
- accumulated_segments.at(-1).end = end;
54
- accumulated_segments.at(-1).score += score;
55
- }
56
- }
57
-
58
- results.push(accumulated_segments.map(
59
- // Convert frame-space to time-space
60
- // and compute the confidence
61
- ({ id, start, end, score }) => ({
62
- id,
63
- start: start * ratio,
64
- end: end * ratio,
65
- confidence: score / (end - start),
66
- })
67
- ));
68
- }
69
- return results;
21
+ get sampling_rate() {
22
+ return this.feature_extractor.config.sampling_rate;
70
23
  }
71
24
  }
package/src/models.js CHANGED
@@ -131,6 +131,7 @@ const MODEL_TYPES = {
131
131
  ImageTextToText: 6,
132
132
  Musicgen: 7,
133
133
  MultiModality: 8,
134
+ Phi3V: 9,
134
135
  }
135
136
  //////////////////////////////////////////////////
136
137
 
@@ -558,7 +559,9 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
558
559
  new_model_inputs.use_cache_branch = boolTensor(!!past_key_values);
559
560
  }
560
561
  if (session.inputNames.includes('position_ids') && new_model_inputs.attention_mask && !new_model_inputs.position_ids) {
561
- new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values);
562
+ // NOTE: Handle a special case for paligemma models, where positions are 1-indexed
563
+ const start_index = self.config.model_type === 'paligemma' ? 1 : 0;
564
+ new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index);
562
565
  }
563
566
 
564
567
  // Unpack the `past_key_values` object into model inputs
@@ -694,14 +697,14 @@ async function imageTextToTextForward(self, {
694
697
  * @param {Tensor} attention_mask
695
698
  * @returns {{data: BigInt64Array, dims: number[]}}
696
699
  */
697
- function cumsum_masked_fill(attention_mask) {
700
+ function cumsum_masked_fill(attention_mask, start_index = 0) {
698
701
  const [bz, seq_len] = attention_mask.dims;
699
702
  const attn_mask_data = attention_mask.data;
700
703
 
701
704
  const data = new BigInt64Array(attn_mask_data.length);
702
705
  for (let i = 0; i < bz; ++i) {
703
706
  const start = i * seq_len;
704
- let sum = BigInt(0);
707
+ let sum = BigInt(start_index);
705
708
  for (let j = 0; j < seq_len; ++j) {
706
709
  const index = start + j;
707
710
  if (attn_mask_data[index] === 0n) {
@@ -728,10 +731,10 @@ function cumsum_masked_fill(attention_mask) {
728
731
  * position_ids = position_ids[:, -input_ids.shape[1] :]
729
732
  * ```
730
733
  */
731
- function createPositionIds(model_inputs, past_key_values = null) {
734
+ function createPositionIds(model_inputs, past_key_values = null, start_index = 0) {
732
735
  const { input_ids, inputs_embeds, attention_mask } = model_inputs;
733
736
 
734
- const { data, dims } = cumsum_masked_fill(attention_mask);
737
+ const { data, dims } = cumsum_masked_fill(attention_mask, start_index);
735
738
  let position_ids = new Tensor('int64', data, dims);
736
739
  if (past_key_values) {
737
740
  const offset = -(input_ids ?? inputs_embeds).dims.at(1);
@@ -904,6 +907,10 @@ export class PreTrainedModel extends Callable {
904
907
  this._forward = imageTextToTextForward;
905
908
  this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
906
909
  break;
910
+ case MODEL_TYPES.Phi3V:
911
+ this.can_generate = true;
912
+ this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
913
+ break;
907
914
 
908
915
  case MODEL_TYPES.MultiModality:
909
916
  this.can_generate = true;
@@ -1068,6 +1075,18 @@ export class PreTrainedModel extends Callable {
1068
1075
  }, options),
1069
1076
  ]);
1070
1077
 
1078
+ } else if (modelType === MODEL_TYPES.Phi3V) {
1079
+ info = await Promise.all([
1080
+ constructSessions(pretrained_model_name_or_path, {
1081
+ prepare_inputs_embeds: 'prepare_inputs_embeds',
1082
+ model: 'model',
1083
+ vision_encoder: 'vision_encoder',
1084
+ }, options),
1085
+ getOptionalConfigs(pretrained_model_name_or_path, {
1086
+ generation_config: 'generation_config.json',
1087
+ }, options),
1088
+ ]);
1089
+
1071
1090
  } else { // should be MODEL_TYPES.EncoderOnly
1072
1091
  if (modelType !== MODEL_TYPES.EncoderOnly) {
1073
1092
  const type = modelName ?? config?.model_type;
@@ -3340,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3340
3359
  }
3341
3360
  //////////////////////////////////////////////////
3342
3361
 
3362
+
3363
+ //////////////////////////////////////////////////
3364
+ // Moonshine models
3365
+ export class MoonshinePreTrainedModel extends PreTrainedModel {
3366
+
3367
+ requires_attention_mask = false;
3368
+ main_input_name = 'input_values';
3369
+ forward_params = [
3370
+ 'input_values',
3371
+ 'decoder_input_ids',
3372
+ 'past_key_values',
3373
+ ];
3374
+ };
3375
+
3376
+ /**
3377
+ * MoonshineModel class for training Moonshine models without a language model head.
3378
+ */
3379
+ export class MoonshineModel extends MoonshinePreTrainedModel { }
3380
+
3381
+ export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
3382
+ //////////////////////////////////////////////////
3383
+
3384
+
3343
3385
  //////////////////////////////////////////////////
3344
3386
  /**
3345
3387
  * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
@@ -3548,6 +3590,30 @@ export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel
3548
3590
  }
3549
3591
  }
3550
3592
 
3593
+ export class PaliGemmaPreTrainedModel extends PreTrainedModel {
3594
+ forward_params = [
3595
+ 'input_ids',
3596
+ // 'inputs_embeds',
3597
+ 'attention_mask',
3598
+ 'pixel_values',
3599
+ 'position_ids',
3600
+ 'past_key_values',
3601
+ ];
3602
+ }
3603
+
3604
+ export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
3605
+ _merge_input_ids_with_image_features(kwargs) {
3606
+ const vision_hidden_size = kwargs.image_features.dims.at(-1);
3607
+ const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
3608
+
3609
+ return default_merge_input_ids_with_image_features({
3610
+ // @ts-ignore
3611
+ image_token_id: this.config.image_token_index,
3612
+ ...kwargs,
3613
+ image_features: reshaped_image_hidden_states,
3614
+ })
3615
+ }
3616
+ }
3551
3617
 
3552
3618
  //////////////////////////////////////////////////
3553
3619
  // Idefics3 Models
@@ -3586,6 +3652,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
3586
3652
  }
3587
3653
  //////////////////////////////////////////////////
3588
3654
 
3655
+ export class Phi3VPreTrainedModel extends PreTrainedModel {
3656
+ forward_params = [
3657
+ 'input_ids',
3658
+ 'inputs_embeds',
3659
+ 'attention_mask',
3660
+ 'position_ids',
3661
+ 'pixel_values',
3662
+ 'image_sizes',
3663
+ 'past_key_values',
3664
+ ];
3665
+ }
3666
+ export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
3667
+
3668
+ async forward({
3669
+ // Produced by the tokenizer/processor:
3670
+ input_ids = null,
3671
+ attention_mask = null,
3672
+ pixel_values = null,
3673
+ image_sizes = null,
3674
+
3675
+ // Used during generation:
3676
+ position_ids = null,
3677
+ inputs_embeds = null,
3678
+ past_key_values = null,
3679
+
3680
+ // Generic generation parameters
3681
+ generation_config = null,
3682
+ logits_processor = null,
3683
+
3684
+ // TODO: needed?
3685
+ ...kwargs
3686
+ }) {
3687
+ if (!inputs_embeds) {
3688
+ let image_features;
3689
+ if (pixel_values && input_ids.dims[1] !== 1) {
3690
+ if (!image_sizes) {
3691
+ throw new Error('`image_sizes` must be provided when `pixel_values` is provided.');
3692
+ }
3693
+
3694
+ // Encode the image
3695
+ ({ image_features } = await sessionRun(this.sessions['vision_encoder'], {
3696
+ pixel_values,
3697
+ image_sizes,
3698
+ }));
3699
+ } else {
3700
+ const hidden_size = this.config.normalized_config.hidden_size;
3701
+ image_features = new Tensor(
3702
+ 'float32',
3703
+ [],
3704
+ [0, hidden_size],
3705
+ );
3706
+ }
3707
+
3708
+ ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], {
3709
+ input_ids,
3710
+ image_features,
3711
+ }));
3712
+ }
3713
+
3714
+ const outputs = await decoderForward(this, {
3715
+ inputs_embeds,
3716
+ past_key_values,
3717
+ attention_mask,
3718
+ position_ids,
3719
+ generation_config,
3720
+ logits_processor,
3721
+ }, false);
3722
+ return outputs;
3723
+ }
3724
+ }
3725
+
3589
3726
  //////////////////////////////////////////////////
3590
3727
  export class CLIPPreTrainedModel extends PreTrainedModel { }
3591
3728
 
@@ -3640,9 +3777,11 @@ export class CLIPModel extends CLIPPreTrainedModel { }
3640
3777
  export class CLIPTextModel extends CLIPPreTrainedModel {
3641
3778
  /** @type {typeof PreTrainedModel.from_pretrained} */
3642
3779
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3643
- // Update default model file name if not provided
3644
- options.model_file_name ??= 'text_model';
3645
- return super.from_pretrained(pretrained_model_name_or_path, options);
3780
+ return super.from_pretrained(pretrained_model_name_or_path, {
3781
+ // Update default model file name if not provided
3782
+ model_file_name: 'text_model',
3783
+ ...options,
3784
+ });
3646
3785
  }
3647
3786
  }
3648
3787
 
@@ -3675,9 +3814,11 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
3675
3814
  export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
3676
3815
  /** @type {typeof PreTrainedModel.from_pretrained} */
3677
3816
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3678
- // Update default model file name if not provided
3679
- options.model_file_name ??= 'text_model';
3680
- return super.from_pretrained(pretrained_model_name_or_path, options);
3817
+ return super.from_pretrained(pretrained_model_name_or_path, {
3818
+ // Update default model file name if not provided
3819
+ model_file_name: 'text_model',
3820
+ ...options,
3821
+ });
3681
3822
  }
3682
3823
  }
3683
3824
 
@@ -3687,9 +3828,11 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
3687
3828
  export class CLIPVisionModel extends CLIPPreTrainedModel {
3688
3829
  /** @type {typeof PreTrainedModel.from_pretrained} */
3689
3830
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3690
- // Update default model file name if not provided
3691
- options.model_file_name ??= 'vision_model';
3692
- return super.from_pretrained(pretrained_model_name_or_path, options);
3831
+ return super.from_pretrained(pretrained_model_name_or_path, {
3832
+ // Update default model file name if not provided
3833
+ model_file_name: 'vision_model',
3834
+ ...options,
3835
+ });
3693
3836
  }
3694
3837
  }
3695
3838
 
@@ -3722,9 +3865,11 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
3722
3865
  export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
3723
3866
  /** @type {typeof PreTrainedModel.from_pretrained} */
3724
3867
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3725
- // Update default model file name if not provided
3726
- options.model_file_name ??= 'vision_model';
3727
- return super.from_pretrained(pretrained_model_name_or_path, options);
3868
+ return super.from_pretrained(pretrained_model_name_or_path, {
3869
+ // Update default model file name if not provided
3870
+ model_file_name: 'vision_model',
3871
+ ...options,
3872
+ });
3728
3873
  }
3729
3874
  }
3730
3875
  //////////////////////////////////////////////////
@@ -3808,9 +3953,11 @@ export class SiglipModel extends SiglipPreTrainedModel { }
3808
3953
  export class SiglipTextModel extends SiglipPreTrainedModel {
3809
3954
  /** @type {typeof PreTrainedModel.from_pretrained} */
3810
3955
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3811
- // Update default model file name if not provided
3812
- options.model_file_name ??= 'text_model';
3813
- return super.from_pretrained(pretrained_model_name_or_path, options);
3956
+ return super.from_pretrained(pretrained_model_name_or_path, {
3957
+ // Update default model file name if not provided
3958
+ model_file_name: 'text_model',
3959
+ ...options,
3960
+ });
3814
3961
  }
3815
3962
  }
3816
3963
 
@@ -3843,9 +3990,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
3843
3990
  export class SiglipVisionModel extends CLIPPreTrainedModel {
3844
3991
  /** @type {typeof PreTrainedModel.from_pretrained} */
3845
3992
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3846
- // Update default model file name if not provided
3847
- options.model_file_name ??= 'vision_model';
3848
- return super.from_pretrained(pretrained_model_name_or_path, options);
3993
+ return super.from_pretrained(pretrained_model_name_or_path, {
3994
+ // Update default model file name if not provided
3995
+ model_file_name: 'vision_model',
3996
+ ...options,
3997
+ });
3849
3998
  }
3850
3999
  }
3851
4000
  //////////////////////////////////////////////////
@@ -3900,18 +4049,22 @@ export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
3900
4049
  export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
3901
4050
  /** @type {typeof PreTrainedModel.from_pretrained} */
3902
4051
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3903
- // Update default model file name if not provided
3904
- options.model_file_name ??= 'text_model';
3905
- return super.from_pretrained(pretrained_model_name_or_path, options);
4052
+ return super.from_pretrained(pretrained_model_name_or_path, {
4053
+ // Update default model file name if not provided
4054
+ model_file_name: 'text_model',
4055
+ ...options,
4056
+ });
3906
4057
  }
3907
4058
  }
3908
4059
 
3909
4060
  export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
3910
4061
  /** @type {typeof PreTrainedModel.from_pretrained} */
3911
4062
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
3912
- // Update default model file name if not provided
3913
- options.model_file_name ??= 'vision_model';
3914
- return super.from_pretrained(pretrained_model_name_or_path, options);
4063
+ return super.from_pretrained(pretrained_model_name_or_path, {
4064
+ // Update default model file name if not provided
4065
+ model_file_name: 'vision_model',
4066
+ ...options,
4067
+ });
3915
4068
  }
3916
4069
  }
3917
4070
  //////////////////////////////////////////////////
@@ -4071,6 +4224,14 @@ export class LlamaForCausalLM extends LlamaPreTrainedModel { }
4071
4224
  //////////////////////////////////////////////////
4072
4225
 
4073
4226
 
4227
+ //////////////////////////////////////////////////
4228
+ // EXAONE models
4229
+ export class ExaonePreTrainedModel extends PreTrainedModel { }
4230
+ export class ExaoneModel extends ExaonePreTrainedModel { }
4231
+ export class ExaoneForCausalLM extends ExaonePreTrainedModel { }
4232
+ //////////////////////////////////////////////////
4233
+
4234
+
4074
4235
  //////////////////////////////////////////////////
4075
4236
  // MobileLLM models
4076
4237
  export class MobileLLMPreTrainedModel extends PreTrainedModel { }
@@ -4086,6 +4247,13 @@ export class OlmoModel extends OlmoPreTrainedModel { }
4086
4247
  export class OlmoForCausalLM extends OlmoPreTrainedModel { }
4087
4248
  //////////////////////////////////////////////////
4088
4249
 
4250
+ //////////////////////////////////////////////////
4251
+ // OLMo2 models
4252
+ export class Olmo2PreTrainedModel extends PreTrainedModel { }
4253
+ export class Olmo2Model extends Olmo2PreTrainedModel { }
4254
+ export class Olmo2ForCausalLM extends Olmo2PreTrainedModel { }
4255
+ //////////////////////////////////////////////////
4256
+
4089
4257
 
4090
4258
  //////////////////////////////////////////////////
4091
4259
  // Granite models
@@ -4502,6 +4670,20 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
4502
4670
  //////////////////////////////////////////////////
4503
4671
 
4504
4672
 
4673
+ //////////////////////////////////////////////////
4674
+ export class IJepaPreTrainedModel extends PreTrainedModel { }
4675
+ export class IJepaModel extends IJepaPreTrainedModel { }
4676
+ export class IJepaForImageClassification extends IJepaPreTrainedModel {
4677
+ /**
4678
+ * @param {any} model_inputs
4679
+ */
4680
+ async _call(model_inputs) {
4681
+ return new SequenceClassifierOutput(await super._call(model_inputs));
4682
+ }
4683
+ }
4684
+ //////////////////////////////////////////////////
4685
+
4686
+
4505
4687
  //////////////////////////////////////////////////
4506
4688
  export class VitPosePreTrainedModel extends PreTrainedModel { }
4507
4689
 
@@ -6112,9 +6294,11 @@ export class ClapModel extends ClapPreTrainedModel { }
6112
6294
  export class ClapTextModelWithProjection extends ClapPreTrainedModel {
6113
6295
  /** @type {typeof PreTrainedModel.from_pretrained} */
6114
6296
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
6115
- // Update default model file name if not provided
6116
- options.model_file_name ??= 'text_model';
6117
- return super.from_pretrained(pretrained_model_name_or_path, options);
6297
+ return super.from_pretrained(pretrained_model_name_or_path, {
6298
+ // Update default model file name if not provided
6299
+ model_file_name: 'text_model',
6300
+ ...options,
6301
+ });
6118
6302
  }
6119
6303
  }
6120
6304
 
@@ -6147,9 +6331,11 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
6147
6331
  export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
6148
6332
  /** @type {typeof PreTrainedModel.from_pretrained} */
6149
6333
  static async from_pretrained(pretrained_model_name_or_path, options = {}) {
6150
- // Update default model file name if not provided
6151
- options.model_file_name ??= 'audio_model';
6152
- return super.from_pretrained(pretrained_model_name_or_path, options);
6334
+ return super.from_pretrained(pretrained_model_name_or_path, {
6335
+ // Update default model file name if not provided
6336
+ model_file_name: 'audio_model',
6337
+ ...options,
6338
+ });
6153
6339
  }
6154
6340
  }
6155
6341
  //////////////////////////////////////////////////
@@ -6772,6 +6958,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
6772
6958
  ['rt_detr', ['RTDetrModel', RTDetrModel]],
6773
6959
  ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
6774
6960
  ['vit', ['ViTModel', ViTModel]],
6961
+ ['ijepa', ['IJepaModel', IJepaModel]],
6775
6962
  ['pvt', ['PvtModel', PvtModel]],
6776
6963
  ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
6777
6964
  ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
@@ -6835,7 +7022,9 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6835
7022
  ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]],
6836
7023
  ['codegen', ['CodeGenModel', CodeGenModel]],
6837
7024
  ['llama', ['LlamaModel', LlamaModel]],
7025
+ ['exaone', ['ExaoneModel', ExaoneModel]],
6838
7026
  ['olmo', ['OlmoModel', OlmoModel]],
7027
+ ['olmo2', ['Olmo2Model', Olmo2Model]],
6839
7028
  ['mobilellm', ['MobileLLMModel', MobileLLMModel]],
6840
7029
  ['granite', ['GraniteModel', GraniteModel]],
6841
7030
  ['cohere', ['CohereModel', CohereModel]],
@@ -6856,6 +7045,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6856
7045
  const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
6857
7046
  ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
6858
7047
  ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
7048
+ ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
6859
7049
  ]);
6860
7050
 
6861
7051
  const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
@@ -6926,7 +7116,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6926
7116
  ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
6927
7117
  ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
6928
7118
  ['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
7119
+ ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]],
6929
7120
  ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
7121
+ ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
6930
7122
  ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
6931
7123
  ['granite', ['GraniteForCausalLM', GraniteForCausalLM]],
6932
7124
  ['cohere', ['CohereForCausalLM', CohereForCausalLM]],
@@ -6944,6 +7136,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6944
7136
  ['falcon', ['FalconForCausalLM', FalconForCausalLM]],
6945
7137
  ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]],
6946
7138
  ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]],
7139
+
7140
+ // Also image-text-to-text
7141
+ ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]],
6947
7142
  ]);
6948
7143
 
6949
7144
  const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
@@ -7000,6 +7195,7 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
7000
7195
  ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
7001
7196
  ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
7002
7197
  ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
7198
+ ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
7003
7199
  ]);
7004
7200
 
7005
7201
  const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
@@ -7008,6 +7204,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
7008
7204
 
7009
7205
  const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
7010
7206
  ['vit', ['ViTForImageClassification', ViTForImageClassification]],
7207
+ ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
7011
7208
  ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
7012
7209
  ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
7013
7210
  ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
@@ -7179,6 +7376,7 @@ const CUSTOM_MAPPING = [
7179
7376
  // OVERRIDE:
7180
7377
  // TODO: Refactor to allow class to specify model
7181
7378
  ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen],
7379
+ ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V],
7182
7380
 
7183
7381
  ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
7184
7382
  ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
@@ -100,4 +100,15 @@ export class TensorOpRegistry {
100
100
  }
101
101
  return this._top_k;
102
102
  }
103
+
104
+ static get slice() {
105
+ if (!this._slice) {
106
+ this._slice = wrap(
107
+ [8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13],
108
+ this.session_options,
109
+ 'y',
110
+ )
111
+ }
112
+ return this._slice;
113
+ }
103
114
  }
package/src/pipelines.js CHANGED
@@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1729
1729
  case 'unispeech-sat':
1730
1730
  case 'hubert':
1731
1731
  return this._call_wav2vec2(audio, kwargs)
1732
+ case 'moonshine':
1733
+ return this._call_moonshine(audio, kwargs)
1732
1734
  default:
1733
1735
  throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
1734
1736
  }
@@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
1882
1884
  }
1883
1885
  return single ? toReturn[0] : toReturn;
1884
1886
  }
1887
+
1888
+ /**
1889
+ * @type {AutomaticSpeechRecognitionPipelineCallback}
1890
+ * @private
1891
+ */
1892
+ async _call_moonshine(audio, kwargs) {
1893
+ const single = !Array.isArray(audio);
1894
+ if (single) {
1895
+ audio = [/** @type {AudioInput} */ (audio)];
1896
+ }
1897
+ const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
1898
+ const preparedAudios = await prepareAudios(audio, sampling_rate);
1899
+ const toReturn = [];
1900
+ for (const aud of preparedAudios) {
1901
+ const inputs = await this.processor(aud);
1902
+
1903
+ // According to the [paper](https://arxiv.org/pdf/2410.15608):
1904
+ // "We use greedy decoding, with a heuristic limit of 6 output tokens
1905
+ // per second of audio to avoid repeated output sequences."
1906
+ const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
1907
+ const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
1908
+
1909
+ const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
1910
+ toReturn.push({ text });
1911
+ }
1912
+ return single ? toReturn[0] : toReturn;
1913
+ }
1914
+
1885
1915
  }
1886
1916
 
1887
1917
  /**
package/src/tokenizers.js CHANGED
@@ -2605,6 +2605,12 @@ export class PreTrainedTokenizer extends Callable {
2605
2605
  this.unk_token = this.getToken('unk_token');
2606
2606
  this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);
2607
2607
 
2608
+ this.bos_token = this.getToken('bos_token');
2609
+ this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);
2610
+
2611
+ this.eos_token = this.getToken('eos_token');
2612
+ this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
2613
+
2608
2614
  this.model_max_length = tokenizerConfig.model_max_length;
2609
2615
 
2610
2616
  /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
@@ -3577,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
3577
3583
  let chunk = new_chunk();
3578
3584
  let time_offset = 0.0;
3579
3585
  const timestamp_begin = this.timestamp_begin;
3586
+ // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
3587
+ // We can calculate the last time stamp token as timestamp_begin plus the number of tokens
3588
+ // tokens from 0.00 to 30.00 which is 1500.
3589
+ const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
3590
+ const timestamp_end = timestamp_begin + total_timestamp_tokens;
3580
3591
 
3581
3592
  let previous_tokens = [];
3582
3593
  let previous_token_timestamps = [];
@@ -3664,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
3664
3675
  } else {
3665
3676
  // 2/ This is a regular special token, ignoring it
3666
3677
  }
3667
- } else if (token >= timestamp_begin) {
3678
+ } else if (token >= timestamp_begin && token <= timestamp_end) {
3668
3679
  // 3/ Timestamp token
3669
3680
  const time = (token - timestamp_begin) * time_precision + time_offset;
3670
3681
  const rounded_time = round(time, 2);