@huggingface/transformers 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. package/README.md +7 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  3. package/dist/transformers.cjs +965 -195
  4. package/dist/transformers.cjs.map +1 -1
  5. package/dist/transformers.js +2251 -1360
  6. package/dist/transformers.js.map +1 -1
  7. package/dist/transformers.min.cjs +1 -352
  8. package/dist/transformers.min.cjs.map +1 -1
  9. package/dist/transformers.min.js +1 -415
  10. package/dist/transformers.min.js.map +1 -1
  11. package/dist/transformers.min.mjs +1 -352
  12. package/dist/transformers.min.mjs.map +1 -1
  13. package/dist/transformers.mjs +979 -194
  14. package/dist/transformers.mjs.map +1 -1
  15. package/package.json +11 -16
  16. package/src/backends/onnx.js +2 -7
  17. package/src/configs.js +3 -1
  18. package/src/env.js +6 -6
  19. package/src/generation/configuration_utils.js +7 -0
  20. package/src/generation/logits_process.js +22 -16
  21. package/src/generation/streamers.js +7 -2
  22. package/src/models/idefics3/image_processing_idefics3.js +219 -0
  23. package/src/models/idefics3/processing_idefics3.js +136 -0
  24. package/src/models/image_processors.js +1 -0
  25. package/src/models/paligemma/processing_paligemma.js +82 -0
  26. package/src/models/processors.js +2 -0
  27. package/src/models.js +169 -39
  28. package/src/tokenizers.js +12 -1
  29. package/src/utils/core.js +53 -9
  30. package/src/utils/dtypes.js +2 -1
  31. package/src/utils/hub.js +8 -12
  32. package/src/utils/image.js +59 -16
  33. package/src/utils/tensor.js +6 -1
  34. package/types/backends/onnx.d.ts +2 -2
  35. package/types/backends/onnx.d.ts.map +1 -1
  36. package/types/base/feature_extraction_utils.d.ts +1 -1
  37. package/types/base/feature_extraction_utils.d.ts.map +1 -1
  38. package/types/base/image_processors_utils.d.ts +2 -2
  39. package/types/base/image_processors_utils.d.ts.map +1 -1
  40. package/types/base/processing_utils.d.ts +4 -4
  41. package/types/base/processing_utils.d.ts.map +1 -1
  42. package/types/configs.d.ts +7 -7
  43. package/types/configs.d.ts.map +1 -1
  44. package/types/env.d.ts +2 -2
  45. package/types/env.d.ts.map +1 -1
  46. package/types/generation/configuration_utils.d.ts +7 -1
  47. package/types/generation/configuration_utils.d.ts.map +1 -1
  48. package/types/generation/logits_process.d.ts +32 -22
  49. package/types/generation/logits_process.d.ts.map +1 -1
  50. package/types/generation/logits_sampler.d.ts.map +1 -1
  51. package/types/generation/parameters.d.ts +5 -5
  52. package/types/generation/stopping_criteria.d.ts +1 -1
  53. package/types/generation/stopping_criteria.d.ts.map +1 -1
  54. package/types/generation/streamers.d.ts +15 -10
  55. package/types/generation/streamers.d.ts.map +1 -1
  56. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
  57. package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
  58. package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
  59. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  60. package/types/models/auto/processing_auto.d.ts +1 -1
  61. package/types/models/auto/processing_auto.d.ts.map +1 -1
  62. package/types/models/clap/feature_extraction_clap.d.ts +1 -1
  63. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  64. package/types/models/detr/image_processing_detr.d.ts +11 -11
  65. package/types/models/detr/image_processing_detr.d.ts.map +1 -1
  66. package/types/models/donut/image_processing_donut.d.ts +1 -1
  67. package/types/models/donut/image_processing_donut.d.ts.map +1 -1
  68. package/types/models/florence2/processing_florence2.d.ts.map +1 -1
  69. package/types/models/idefics3/image_processing_idefics3.d.ts +40 -0
  70. package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -0
  71. package/types/models/idefics3/processing_idefics3.d.ts +19 -0
  72. package/types/models/idefics3/processing_idefics3.d.ts.map +1 -0
  73. package/types/models/image_processors.d.ts +1 -0
  74. package/types/models/janus/image_processing_janus.d.ts +1 -1
  75. package/types/models/janus/image_processing_janus.d.ts.map +1 -1
  76. package/types/models/janus/processing_janus.d.ts.map +1 -1
  77. package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
  78. package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
  79. package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
  80. package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
  81. package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
  82. package/types/models/paligemma/processing_paligemma.d.ts +12 -0
  83. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
  84. package/types/models/processors.d.ts +2 -0
  85. package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
  86. package/types/models/pyannote/processing_pyannote.d.ts +1 -1
  87. package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
  88. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  89. package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
  90. package/types/models/sam/image_processing_sam.d.ts.map +1 -1
  91. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
  92. package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
  93. package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
  94. package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
  95. package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
  96. package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
  97. package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
  98. package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
  99. package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
  100. package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
  101. package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
  102. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
  103. package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
  104. package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
  105. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  106. package/types/models/whisper/generation_whisper.d.ts.map +1 -1
  107. package/types/models/whisper/processing_whisper.d.ts.map +1 -1
  108. package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
  109. package/types/models.d.ts +44 -10
  110. package/types/models.d.ts.map +1 -1
  111. package/types/ops/registry.d.ts.map +1 -1
  112. package/types/pipelines.d.ts +26 -51
  113. package/types/pipelines.d.ts.map +1 -1
  114. package/types/tokenizers.d.ts +10 -6
  115. package/types/tokenizers.d.ts.map +1 -1
  116. package/types/utils/audio.d.ts.map +1 -1
  117. package/types/utils/constants.d.ts.map +1 -1
  118. package/types/utils/core.d.ts +94 -22
  119. package/types/utils/core.d.ts.map +1 -1
  120. package/types/utils/data-structures.d.ts.map +1 -1
  121. package/types/utils/devices.d.ts.map +1 -1
  122. package/types/utils/dtypes.d.ts +3 -2
  123. package/types/utils/dtypes.d.ts.map +1 -1
  124. package/types/utils/generic.d.ts.map +1 -1
  125. package/types/utils/hub.d.ts +3 -3
  126. package/types/utils/hub.d.ts.map +1 -1
  127. package/types/utils/image.d.ts +14 -1
  128. package/types/utils/image.d.ts.map +1 -1
  129. package/types/utils/maths.d.ts +10 -10
  130. package/types/utils/maths.d.ts.map +1 -1
  131. package/types/utils/tensor.d.ts +10 -8
  132. package/types/utils/tensor.d.ts.map +1 -1
@@ -0,0 +1,82 @@
1
+ import { Processor } from "../../base/processing_utils.js";
2
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
3
+ import { AutoTokenizer } from "../../tokenizers.js";
4
+
5
+ const IMAGE_TOKEN = "<image>";
6
+
7
/**
 * Build the model prompt string for one text sample.
 *
 * The resulting string is: the image placeholder token repeated once per
 * image slot (`image_seq_len * num_images` repetitions), followed by the
 * BOS token, the user prompt, and a trailing newline.
 *
 * @param {string} prompt The user-supplied text prefix.
 * @param {string} bos_token The tokenizer's beginning-of-sequence token.
 * @param {number} image_seq_len Number of image tokens per image.
 * @param {string} image_token The image placeholder token.
 * @param {number} num_images Number of images accompanying this text.
 * @returns {string} The fully formatted prompt string.
 */
function build_string_from_input(
    prompt,
    bos_token,
    image_seq_len,
    image_token,
    num_images,
) {
    const image_prefix = image_token.repeat(image_seq_len * num_images);
    return image_prefix + bos_token + prompt + "\n";
}
16
+
17
export class PaliGemmaProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor
    static uses_processor_config = false;

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     */

    /**
     * Prepare combined text and image model inputs.
     * `images` is required; `text` is optional (captioning mode when absent).
     * @param {RawImage|RawImage[]} images The image(s) to process.
     * @param {string|string[]|null} text Optional text prompt(s).
     * @param {Object} kwargs Extra options forwarded to the tokenizer and image processor.
     * @returns {Promise<Object>} Merged outputs of the image processor and tokenizer.
     */
    async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
        if (!text) {
            console.warn(
                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
            )
            text = ""
        }

        const image_list = Array.isArray(images) ? images : [images];
        const texts = Array.isArray(text) ? text : [text];

        const bos_token = this.tokenizer.bos_token;
        const image_seq_length = this.image_processor.config.image_seq_length;

        let input_strings;
        if (texts.some((t) => t.includes(IMAGE_TOKEN))) {
            // User provided their own `<image>` placeholders: expand each one to
            // `image_seq_length` image tokens, then insert BOS immediately after
            // the final image token (or at the start if the sample has none).
            input_strings = texts.map((sample) => {
                const expanded = sample.replaceAll(IMAGE_TOKEN, IMAGE_TOKEN.repeat(image_seq_length));
                const last_image_token = expanded.lastIndexOf(IMAGE_TOKEN);
                const insert_at = last_image_token === -1 ? 0 : last_image_token + IMAGE_TOKEN.length;
                return expanded.slice(0, insert_at) + bos_token + expanded.slice(insert_at) + "\n";
            });
        } else {
            console.warn(
                "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
                "image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
                "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
                "each text has and add special tokens."
            )
            input_strings = texts.map((sample) =>
                build_string_from_input(
                    sample,
                    bos_token,
                    image_seq_length,
                    IMAGE_TOKEN,
                    image_list.length,
                )
            );
        }

        const text_inputs = this.tokenizer(input_strings, kwargs);
        const image_inputs = await this.image_processor(image_list, kwargs);

        return {
            ...image_inputs,
            ...text_inputs,
        };
    }
}
@@ -1,8 +1,10 @@
1
1
  export * from './florence2/processing_florence2.js';
2
2
  export * from './mgp_str/processing_mgp_str.js';
3
+ export * from './idefics3/processing_idefics3.js';
3
4
  export * from './janus/processing_janus.js';
4
5
  export * from './jina_clip/processing_jina_clip.js';
5
6
  export * from './owlvit/processing_owlvit.js';
7
+ export * from './paligemma/processing_paligemma.js';
6
8
  export * from './pyannote/processing_pyannote.js';
7
9
  export * from './qwen2_vl/processing_qwen2_vl.js';
8
10
  export * from './sam/processing_sam.js';
package/src/models.js CHANGED
@@ -182,6 +182,22 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
182
182
  }
183
183
  }
184
184
 
185
+ if (dtype === DATA_TYPES.auto) {
186
+ // Try to choose the auto dtype based on the custom config
187
+ let config_dtype = custom_config.dtype;
188
+ if (typeof config_dtype !== 'string') {
189
+ config_dtype = config_dtype[fileName];
190
+ }
191
+
192
+ if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) {
193
+ // Defined by the custom config, and is not "auto"
194
+ dtype = config_dtype;
195
+ } else {
196
+ // Choose default dtype based on device, falling back to fp32
197
+ dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32;
198
+ }
199
+ }
200
+
185
201
  const selectedDtype = /** @type {import("./utils/dtypes.js").DataType} */(dtype);
186
202
 
187
203
  if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) {
@@ -387,9 +403,17 @@ async function sessionRun(session, inputs) {
387
403
  output = replaceTensors(output);
388
404
  return output;
389
405
  } catch (e) {
406
+ // Error messages can be long (nested) and uninformative. For this reason,
407
+ // we apply minor formatting to show the most important information
408
+ const formatted = Object.fromEntries(Object.entries(checkedInputs)
409
+ .map(([k, { type, dims, data }]) => [k, {
410
+ // Extract these properties from the underlying ORT tensor
411
+ type, dims, data,
412
+ }]));
413
+
390
414
  // This usually occurs when the inputs are of the wrong type.
391
415
  console.error(`An error occurred during model execution: "${e}".`);
392
- console.error('Inputs given to model:', checkedInputs)
416
+ console.error('Inputs given to model:', formatted);
393
417
  throw e;
394
418
  }
395
419
  }
@@ -534,7 +558,9 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
534
558
  new_model_inputs.use_cache_branch = boolTensor(!!past_key_values);
535
559
  }
536
560
  if (session.inputNames.includes('position_ids') && new_model_inputs.attention_mask && !new_model_inputs.position_ids) {
537
- new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values);
561
+ // NOTE: Handle a special case for paligemma models, where positions are 1-indexed
562
+ const start_index = self.config.model_type === 'paligemma' ? 1 : 0;
563
+ new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index);
538
564
  }
539
565
 
540
566
  // Unpack the `past_key_values` object into model inputs
@@ -546,6 +572,39 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
546
572
  }
547
573
 
548
574
 
575
+
576
+ function default_merge_input_ids_with_image_features({
577
+ image_token_id,
578
+ inputs_embeds,
579
+ image_features,
580
+ input_ids,
581
+ attention_mask,
582
+ }) {
583
+ const image_tokens = input_ids.tolist().map(ids =>
584
+ ids.reduce((acc, x, idx) => {
585
+ if (x == image_token_id) acc.push(idx);
586
+ return acc;
587
+ }, [])
588
+ );
589
+ const n_image_tokens = image_tokens.reduce((acc, x) => acc + x.length, 0);
590
+ const n_image_features = image_features.dims[0];
591
+ if (n_image_tokens !== n_image_features) {
592
+ throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
593
+ }
594
+
595
+ // Equivalent to performing a masked_scatter
596
+ let img = 0;
597
+ for (let i = 0; i < image_tokens.length; ++i) {
598
+ const tokens = image_tokens[i];
599
+ const embeds = inputs_embeds[i];
600
+ for (let j = 0; j < tokens.length; ++j) {
601
+ embeds[tokens[j]].data.set(image_features[img++].data)
602
+ }
603
+ }
604
+ return { inputs_embeds, attention_mask }
605
+ }
606
+
607
+
549
608
  /**
550
609
  * Forward pass of an image-text-to-text model.
551
610
  * @param {Object} self The image-text-to-text model model.
@@ -637,14 +696,14 @@ async function imageTextToTextForward(self, {
637
696
  * @param {Tensor} attention_mask
638
697
  * @returns {{data: BigInt64Array, dims: number[]}}
639
698
  */
640
- function cumsum_masked_fill(attention_mask) {
699
+ function cumsum_masked_fill(attention_mask, start_index = 0) {
641
700
  const [bz, seq_len] = attention_mask.dims;
642
701
  const attn_mask_data = attention_mask.data;
643
702
 
644
703
  const data = new BigInt64Array(attn_mask_data.length);
645
704
  for (let i = 0; i < bz; ++i) {
646
705
  const start = i * seq_len;
647
- let sum = BigInt(0);
706
+ let sum = BigInt(start_index);
648
707
  for (let j = 0; j < seq_len; ++j) {
649
708
  const index = start + j;
650
709
  if (attn_mask_data[index] === 0n) {
@@ -671,10 +730,10 @@ function cumsum_masked_fill(attention_mask) {
671
730
  * position_ids = position_ids[:, -input_ids.shape[1] :]
672
731
  * ```
673
732
  */
674
- function createPositionIds(model_inputs, past_key_values = null) {
733
+ function createPositionIds(model_inputs, past_key_values = null, start_index = 0) {
675
734
  const { input_ids, inputs_embeds, attention_mask } = model_inputs;
676
735
 
677
- const { data, dims } = cumsum_masked_fill(attention_mask);
736
+ const { data, dims } = cumsum_masked_fill(attention_mask, start_index);
678
737
  let position_ids = new Tensor('int64', data, dims);
679
738
  if (past_key_values) {
680
739
  const offset = -(input_ids ?? inputs_embeds).dims.at(1);
@@ -1013,7 +1072,10 @@ export class PreTrainedModel extends Callable {
1013
1072
 
1014
1073
  } else { // should be MODEL_TYPES.EncoderOnly
1015
1074
  if (modelType !== MODEL_TYPES.EncoderOnly) {
1016
- console.warn(`Model type for '${modelName ?? config?.model_type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`)
1075
+ const type = modelName ?? config?.model_type;
1076
+ if (type !== 'custom') {
1077
+ console.warn(`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`)
1078
+ }
1017
1079
  }
1018
1080
  info = await Promise.all([
1019
1081
  constructSessions(pretrained_model_name_or_path, {
@@ -1757,7 +1819,7 @@ export class PreTrainedModel extends Callable {
1757
1819
  const dtype = session?.config?.kv_cache_dtype ?? 'float32';
1758
1820
  const empty = (dtype === 'float16') ? new Uint16Array() : [];
1759
1821
 
1760
- const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask).dims?.[0] ?? 1;
1822
+ const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1;
1761
1823
  const shapes = getKeyValueShapes(this.config, { batch_size });
1762
1824
 
1763
1825
  for (const name in shapes) {
@@ -3304,8 +3366,8 @@ export class VisionEncoderDecoderModel extends PreTrainedModel {
3304
3366
  export class LlavaPreTrainedModel extends PreTrainedModel {
3305
3367
  forward_params = [
3306
3368
  'input_ids',
3307
- 'pixel_values',
3308
3369
  'attention_mask',
3370
+ 'pixel_values',
3309
3371
  'position_ids',
3310
3372
  'past_key_values',
3311
3373
  ];
@@ -3487,6 +3549,70 @@ export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel
3487
3549
  return decoder_outputs;
3488
3550
  }
3489
3551
  }
3552
+
3553
+ export class PaliGemmaPreTrainedModel extends PreTrainedModel {
3554
+ forward_params = [
3555
+ 'input_ids',
3556
+ // 'inputs_embeds',
3557
+ 'attention_mask',
3558
+ 'pixel_values',
3559
+ 'position_ids',
3560
+ 'past_key_values',
3561
+ ];
3562
+ }
3563
+
3564
export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
    /**
     * Scatter image features into the text embedding sequence at the
     * positions of the image placeholder tokens.
     * @param {Object} kwargs Contains `image_features`, `inputs_embeds`, `input_ids`, and `attention_mask`.
     * @returns {Object} The merged `inputs_embeds` and `attention_mask`.
     */
    _merge_input_ids_with_image_features(kwargs) {
        // Flatten image features to (num_image_tokens, hidden_size) before merging.
        const hidden_size = kwargs.image_features.dims.at(-1);
        const flattened_features = kwargs.image_features.view(-1, hidden_size);

        return default_merge_input_ids_with_image_features({
            // @ts-ignore
            image_token_id: this.config.image_token_index,
            ...kwargs,
            image_features: flattened_features,
        });
    }
}
3577
+
3578
+ //////////////////////////////////////////////////
3579
+ // Idefics3 Models
3580
+ export class Idefics3PreTrainedModel extends PreTrainedModel {
3581
+ forward_params = [
3582
+ 'input_ids',
3583
+ 'attention_mask',
3584
+ 'pixel_values',
3585
+ 'pixel_attention_mask',
3586
+ 'position_ids',
3587
+ 'past_key_values',
3588
+ ];
3589
+ }
3590
+
3591
/**
 * The Idefics3 model, which consists of a vision backbone and a language model.
 * (Docstring previously said "LLAVA" — a copy-paste error; this class is Idefics3.)
 */
export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {

    /**
     * Run the vision encoder session and return its `image_features` output.
     * @param {{pixel_values: Object, pixel_attention_mask: Object}} inputs Pixel values and their attention mask.
     * @returns {Promise<Object>} The encoded image features.
     */
    async encode_image({ pixel_values, pixel_attention_mask }) {
        const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask })).image_features;
        return features;
    }

    /**
     * Scatter image features into the text embedding sequence at the
     * positions of the image placeholder tokens.
     * @param {Object} kwargs Contains `image_features`, `inputs_embeds`, `input_ids`, and `attention_mask`.
     * @returns {Object} The merged `inputs_embeds` and `attention_mask`.
     */
    _merge_input_ids_with_image_features(kwargs) {
        // Flatten image features to (num_image_tokens, hidden_size) before merging.
        const vision_hidden_size = kwargs.image_features.dims.at(-1);
        const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);

        return default_merge_input_ids_with_image_features({
            // @ts-ignore
            image_token_id: this.config.image_token_id,
            ...kwargs,
            image_features: reshaped_image_hidden_states,
        })
    }
}
3613
+ //////////////////////////////////////////////////
3614
+
3615
+ //////////////////////////////////////////////////
3490
3616
  export class CLIPPreTrainedModel extends PreTrainedModel { }
3491
3617
 
3492
3618
  /**
@@ -3986,6 +4112,13 @@ export class OlmoModel extends OlmoPreTrainedModel { }
3986
4112
  export class OlmoForCausalLM extends OlmoPreTrainedModel { }
3987
4113
  //////////////////////////////////////////////////
3988
4114
 
4115
+ //////////////////////////////////////////////////
4116
+ // OLMo2 models
4117
+ export class Olmo2PreTrainedModel extends PreTrainedModel { }
4118
+ export class Olmo2Model extends Olmo2PreTrainedModel { }
4119
+ export class Olmo2ForCausalLM extends Olmo2PreTrainedModel { }
4120
+ //////////////////////////////////////////////////
4121
+
3989
4122
 
3990
4123
  //////////////////////////////////////////////////
3991
4124
  // Granite models
@@ -4280,36 +4413,12 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
4280
4413
  return features;
4281
4414
  }
4282
4415
 
4283
- _merge_input_ids_with_image_features({
4284
- inputs_embeds,
4285
- image_features,
4286
- input_ids,
4287
- attention_mask,
4288
- }) {
4289
- // @ts-ignore
4290
- const { image_token_id } = this.config;
4291
- const image_tokens = input_ids.tolist().map(ids =>
4292
- ids.reduce((acc, x, idx) => {
4293
- if (x == image_token_id) acc.push(idx);
4294
- return acc;
4295
- }, [])
4296
- );
4297
- const n_image_tokens = image_tokens.reduce((acc, x) => acc + x.length, 0);
4298
- const n_image_features = image_features.dims[0];
4299
- if (n_image_tokens !== n_image_features) {
4300
- throw new Error(`Image features and image tokens do not match: tokens: ${n_image_tokens}, features ${n_image_features}`);
4301
- }
4302
-
4303
- // Equivalent to performing a masked_scatter
4304
- let img = 0;
4305
- for (let i = 0; i < image_tokens.length; ++i) {
4306
- const tokens = image_tokens[i];
4307
- const embeds = inputs_embeds[i];
4308
- for (let j = 0; j < tokens.length; ++j) {
4309
- embeds[tokens[j]].data.set(image_features[img++].data)
4310
- }
4311
- }
4312
- return { inputs_embeds, attention_mask }
4416
+ _merge_input_ids_with_image_features(kwargs) {
4417
+ return default_merge_input_ids_with_image_features({
4418
+ // @ts-ignore
4419
+ image_token_id: this.config.image_token_id,
4420
+ ...kwargs
4421
+ })
4313
4422
  }
4314
4423
 
4315
4424
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
@@ -4426,6 +4535,20 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
4426
4535
  //////////////////////////////////////////////////
4427
4536
 
4428
4537
 
4538
+ //////////////////////////////////////////////////
4539
export class IJepaPreTrainedModel extends PreTrainedModel { }
export class IJepaModel extends IJepaPreTrainedModel { }
export class IJepaForImageClassification extends IJepaPreTrainedModel {
    /**
     * Run the model and wrap the raw outputs in a `SequenceClassifierOutput`.
     * @param {any} model_inputs
     */
    async _call(model_inputs) {
        const outputs = await super._call(model_inputs);
        return new SequenceClassifierOutput(outputs);
    }
}
4549
+ //////////////////////////////////////////////////
4550
+
4551
+
4429
4552
  //////////////////////////////////////////////////
4430
4553
  export class VitPosePreTrainedModel extends PreTrainedModel { }
4431
4554
 
@@ -6696,6 +6819,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
6696
6819
  ['rt_detr', ['RTDetrModel', RTDetrModel]],
6697
6820
  ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
6698
6821
  ['vit', ['ViTModel', ViTModel]],
6822
+ ['ijepa', ['IJepaModel', IJepaModel]],
6699
6823
  ['pvt', ['PvtModel', PvtModel]],
6700
6824
  ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
6701
6825
  ['vit_mae', ['ViTMAEModel', ViTMAEModel]],
@@ -6760,6 +6884,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
6760
6884
  ['codegen', ['CodeGenModel', CodeGenModel]],
6761
6885
  ['llama', ['LlamaModel', LlamaModel]],
6762
6886
  ['olmo', ['OlmoModel', OlmoModel]],
6887
+ ['olmo2', ['Olmo2Model', Olmo2Model]],
6763
6888
  ['mobilellm', ['MobileLLMModel', MobileLLMModel]],
6764
6889
  ['granite', ['GraniteModel', GraniteModel]],
6765
6890
  ['cohere', ['CohereModel', CohereModel]],
@@ -6851,6 +6976,7 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
6851
6976
  ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
6852
6977
  ['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
6853
6978
  ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
6979
+ ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
6854
6980
  ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
6855
6981
  ['granite', ['GraniteForCausalLM', GraniteForCausalLM]],
6856
6982
  ['cohere', ['CohereForCausalLM', CohereForCausalLM]],
@@ -6914,6 +7040,7 @@ const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
6914
7040
 
6915
7041
  const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([
6916
7042
  ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]],
7043
+ ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
6917
7044
  ]);
6918
7045
 
6919
7046
  const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
@@ -6922,6 +7049,8 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
6922
7049
  ['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]],
6923
7050
  ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
6924
7051
  ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
7052
+ ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
7053
+ ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
6925
7054
  ]);
6926
7055
 
6927
7056
  const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
@@ -6930,6 +7059,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
6930
7059
 
6931
7060
  const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
6932
7061
  ['vit', ['ViTForImageClassification', ViTForImageClassification]],
7062
+ ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
6933
7063
  ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
6934
7064
  ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
6935
7065
  ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
package/src/tokenizers.js CHANGED
@@ -2605,6 +2605,12 @@ export class PreTrainedTokenizer extends Callable {
2605
2605
  this.unk_token = this.getToken('unk_token');
2606
2606
  this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);
2607
2607
 
2608
+ this.bos_token = this.getToken('bos_token');
2609
+ this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);
2610
+
2611
+ this.eos_token = this.getToken('eos_token');
2612
+ this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
2613
+
2608
2614
  this.model_max_length = tokenizerConfig.model_max_length;
2609
2615
 
2610
2616
  /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
@@ -3577,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
3577
3583
  let chunk = new_chunk();
3578
3584
  let time_offset = 0.0;
3579
3585
  const timestamp_begin = this.timestamp_begin;
3586
+ // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
3587
+ // We can calculate the last time stamp token as timestamp_begin plus the number of tokens
3588
+ // from 0.00 to 30.00, which is 1500.
3589
+ const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
3590
+ const timestamp_end = timestamp_begin + total_timestamp_tokens;
3580
3591
 
3581
3592
  let previous_tokens = [];
3582
3593
  let previous_token_timestamps = [];
@@ -3664,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
3664
3675
  } else {
3665
3676
  // 2/ This is a regular special token, ignoring it
3666
3677
  }
3667
- } else if (token >= timestamp_begin) {
3678
+ } else if (token >= timestamp_begin && token <= timestamp_end) {
3668
3679
  // 3/ Timestamp token
3669
3680
  const time = (token - timestamp_begin) * time_precision + time_offset;
3670
3681
  const rounded_time = round(time, 2);
package/src/utils/core.js CHANGED
@@ -9,15 +9,45 @@
9
9
  */
10
10
 
11
11
  /**
12
- * @typedef {Object} ProgressInfo
13
- * @property {'initiate' | 'download' | 'progress' | 'done'} status The status of the progress item.
14
- * @property {string} name This can be either:
15
- * - a string, the *model id* of a model repo on huggingface.co.
16
- * - a path to a *directory* potentially containing the file.
17
- * @property {string} file The name of the file
18
- * @property {number} [progress] A number between 0 and 100. Only available for the 'progress' status.
19
- * @property {number} [loaded] The number of bytes loaded. Only available for the 'progress' status.
20
- * @property {number} [total] The total number of bytes to be loaded. Only available for the 'progress' status.
12
+ * @typedef {Object} InitiateProgressInfo
13
+ * @property {'initiate'} status
14
+ * @property {string} name The model id or directory path.
15
+ * @property {string} file The name of the file.
16
+ */
17
+
18
+ /**
19
+ * @typedef {Object} DownloadProgressInfo
20
+ * @property {'download'} status
21
+ * @property {string} name The model id or directory path.
22
+ * @property {string} file The name of the file.
23
+ */
24
+
25
+ /**
26
+ * @typedef {Object} ProgressStatusInfo
27
+ * @property {'progress'} status
28
+ * @property {string} name The model id or directory path.
29
+ * @property {string} file The name of the file.
30
+ * @property {number} progress A number between 0 and 100.
31
+ * @property {number} loaded The number of bytes loaded.
32
+ * @property {number} total The total number of bytes to be loaded.
33
+ */
34
+
35
+ /**
36
+ * @typedef {Object} DoneProgressInfo
37
+ * @property {'done'} status
38
+ * @property {string} name The model id or directory path.
39
+ * @property {string} file The name of the file.
40
+ */
41
+
42
+ /**
43
+ * @typedef {Object} ReadyProgressInfo
44
+ * @property {'ready'} status
45
+ * @property {string} task The loaded task.
46
+ * @property {string} model The loaded model.
47
+ */
48
+
49
+ /**
50
+ * @typedef {InitiateProgressInfo | DownloadProgressInfo | ProgressStatusInfo | DoneProgressInfo | ReadyProgressInfo} ProgressInfo
21
51
  */
22
52
 
23
53
  /**
@@ -187,3 +217,17 @@ export function len(s) {
187
217
  for (const c of s) ++length;
188
218
  return length;
189
219
  }
220
+
221
+ /**
222
+ * Count the occurrences of a value in an array or string.
223
+ * This mimics the behavior of Python's `count` method.
224
+ * @param {any[]|string} arr The array or string to search.
225
+ * @param {any} value The value to count.
226
+ */
227
+ export function count(arr, value) {
228
+ let count = 0;
229
+ for (const v of arr) {
230
+ if (v === value) ++count;
231
+ }
232
+ return count;
233
+ }
@@ -31,6 +31,7 @@ export const isWebGpuFp16Supported = (function () {
31
31
  })();
32
32
 
33
33
  export const DATA_TYPES = Object.freeze({
34
+ auto: 'auto', // Auto-detect based on environment
34
35
  fp32: 'fp32',
35
36
  fp16: 'fp16',
36
37
  q8: 'q8',
@@ -47,7 +48,7 @@ export const DEFAULT_DEVICE_DTYPE_MAPPING = Object.freeze({
47
48
  [DEVICE_TYPES.wasm]: DATA_TYPES.q8,
48
49
  });
49
50
 
50
- /** @type {Record<DataType, string>} */
51
+ /** @type {Record<Exclude<DataType, "auto">, string>} */
51
52
  export const DEFAULT_DTYPE_SUFFIX_MAPPING = Object.freeze({
52
53
  [DATA_TYPES.fp32]: '',
53
54
  [DATA_TYPES.fp16]: '_fp16',
package/src/utils/hub.js CHANGED
@@ -504,13 +504,6 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
504
504
  file: filename
505
505
  })
506
506
 
507
- /** @type {import('./core.js').ProgressInfo} */
508
- const progressInfo = {
509
- status: 'progress',
510
- name: path_or_repo_id,
511
- file: filename
512
- }
513
-
514
507
  /** @type {Uint8Array} */
515
508
  let buffer;
516
509
 
@@ -530,7 +523,9 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
530
523
 
531
524
  // For completeness, we still fire the final progress callback
532
525
  dispatchCallback(options.progress_callback, {
533
- ...progressInfo,
526
+ status: 'progress',
527
+ name: path_or_repo_id,
528
+ file: filename,
534
529
  progress: 100,
535
530
  loaded: buffer.length,
536
531
  total: buffer.length,
@@ -538,7 +533,9 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti
538
533
  } else {
539
534
  buffer = await readResponse(response, data => {
540
535
  dispatchCallback(options.progress_callback, {
541
- ...progressInfo,
536
+ status: 'progress',
537
+ name: path_or_repo_id,
538
+ file: filename,
542
539
  ...data,
543
540
  })
544
541
  })
@@ -595,12 +592,11 @@ export async function getModelJSON(modelPath, fileName, fatal = true, options =
595
592
 
596
593
  return JSON.parse(jsonData);
597
594
  }
598
-
599
595
  /**
600
596
  * Read and track progress when reading a Response object
601
597
  *
602
- * @param {any} response The Response object to read
603
- * @param {function} progress_callback The function to call with progress updates
598
+ * @param {Response|FileResponse} response The Response object to read
599
+ * @param {(data: {progress: number, loaded: number, total: number}) => void} progress_callback The function to call with progress updates
604
600
  * @returns {Promise<Uint8Array>} A Promise that resolves with the Uint8Array buffer
605
601
  */
606
602
  async function readResponse(response, progress_callback) {